[clang] [llvm] [AMDGPU] Don't allow wgp mode on gfx1250 (PR #153680)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Aug 14 13:57:42 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang
@llvm/pr-subscribers-clang-driver
Author: Stanislav Mekhanoshin (rampitec)
<details>
<summary>Changes</summary>
- gfx1250 supports only CU mode, so WGP mode is not allowed on it.
---
Patch is 156.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153680.diff
13 Files Affected:
- (modified) clang/test/CodeGenHIP/hip-cumode.hip (+8-2)
- (modified) clang/test/Driver/hip-macros.hip (+10-4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+4-1)
- (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+3-2)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+5-1)
- (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp (+2-1)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+18-3)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+51-50)
- (modified) llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll (+141-223)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll (+1211)
- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+144-144)
- (modified) llvm/test/MC/AMDGPU/hsa-diag-v4.s (+25-8)
``````````diff
diff --git a/clang/test/CodeGenHIP/hip-cumode.hip b/clang/test/CodeGenHIP/hip-cumode.hip
index 1aa1ca7a1a7ee..61fd53c644e8c 100644
--- a/clang/test/CodeGenHIP/hip-cumode.hip
+++ b/clang/test/CodeGenHIP/hip-cumode.hip
@@ -5,14 +5,20 @@
// RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
// RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
-// RUN: %s 2>&1 | FileCheck --check-prefixes=NOWGP,WARN-CUMODE %s
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=NOWGP,WARN-CUMODE %s
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
-// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \
+// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \
+// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=NOWGP,WARN-CUMODE %s
+// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
// NOWGP-NOT: .amdhsa_workgroup_processor_mode
// CUMODE-ON: .amdhsa_workgroup_processor_mode 0
// CUMODE-OFF: .amdhsa_workgroup_processor_mode 1
diff --git a/clang/test/Driver/hip-macros.hip b/clang/test/Driver/hip-macros.hip
index bd93f9985a774..516e01a6c4743 100644
--- a/clang/test/Driver/hip-macros.hip
+++ b/clang/test/Driver/hip-macros.hip
@@ -27,21 +27,27 @@
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
-// RUN: %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \
+// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \
+// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
// Check no duplicate warnings.
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: -mno-cumode -mno-cumode \
-// RUN: %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
-// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
-// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
+// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
+// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
// CUMODE-ON-DAG: #define __AMDGCN_CUMODE__ 1
// CUMODE-OFF-DAG: #define __AMDGCN_CUMODE__ 0
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c7d2d268a2707..188c126cb9fbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1144,8 +1144,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
CreateExpr(STM.getWavefrontSize()), Ctx),
CreateExpr(1ULL << ScratchAlignShift));
- if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
+ if (STM.supportsWGP()) {
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
+ }
+
+ if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
ProgInfo.MemOrdered = 1;
ProgInfo.FwdProgress = 1;
}
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 0184075c2c909..951473264d089 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6270,8 +6270,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
ExprVal, ValRange);
} else if (ID == ".amdhsa_workgroup_processor_mode") {
- if (IVersion.Major < 10)
- return Error(IDRange.Start, "directive requires gfx10+", IDRange);
+ if (!supportsWGP(getSTI()))
+ return Error(IDRange.Start,
+ "directive unsupported on " + getSTI().getCPU(), IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal,
ValRange);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f47ddf5d93ec3..7ca7e8448c63d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -390,7 +390,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// the original value.
bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
- bool supportsWGP() const { return getGeneration() >= GFX10; }
+ bool supportsWGP() const {
+ if (GFX1250Insts)
+ return false;
+ return getGeneration() >= GFX10;
+ }
bool hasIntClamp() const {
return HasIntClamp;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 68302f0dd0d64..1f35e92151bfc 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -563,11 +563,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PrintField(KD.compute_pgm_rsrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split");
- if (IVersion.Major >= 10) {
+ if (AMDGPU::supportsWGP(STI))
PrintField(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE,
".amdhsa_workgroup_processor_mode");
+ if (IVersion.Major >= 10) {
PrintField(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ec9f1abdd8467..c41d62748c4be 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1167,12 +1167,21 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
// "Per CU" really means "per whatever functional block the waves of a
- // workgroup must share". For gfx10 in CU mode this is the CU, which contains
+ // workgroup must share".
+
+ // GFX12.5 only supports CU mode, which contains four SIMDs.
+ if (isGFX1250(*STI)) {
+ assert(STI->getFeatureBits().test(FeatureCuMode));
+ return 4;
+ }
+
+ // For gfx10 in CU mode the functional block is the CU, which contains
// two SIMDs.
if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode))
return 2;
- // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains
- // two CUs, so a total of four SIMDs.
+
+ // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP
+ // contains two CUs, so a total of four SIMDs.
return 4;
}
@@ -2480,6 +2489,12 @@ bool isGFX1250(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts];
}
+bool supportsWGP(const MCSubtargetInfo &STI) {
+ if (isGFX1250(STI))
+ return false;
+ return isGFX10Plus(STI);
+}
+
bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); }
bool isNotGFX10Plus(const MCSubtargetInfo &STI) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 704bf106ace76..befab68bb5698 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1549,6 +1549,7 @@ bool isGFX11Plus(const MCSubtargetInfo &STI);
bool isGFX12(const MCSubtargetInfo &STI);
bool isGFX12Plus(const MCSubtargetInfo &STI);
bool isGFX1250(const MCSubtargetInfo &STI);
+bool supportsWGP(const MCSubtargetInfo &STI);
bool isNotGFX12Plus(const MCSubtargetInfo &STI);
bool isNotGFX11Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 3daae98961bff..01854c8560ce2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2854,89 +2854,90 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0
+; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1]
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1]
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
-; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
-; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19]
+; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
-; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, v15
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
-; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0
-; GFX1250-NEXT: v_mov_b32_e32 v0, v16
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo
+; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1
+; GFX1250-NEXT: v_mov_b32_e32 v1, v14
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index f0db321d3931a..e532deaca98a8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
-; Test S_WAIT_XCNT in...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/153680
More information about the cfe-commits mailing list