[llvm] [AMDGPU] Fix mul combine for MUL24 (PR #79110)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 28 22:28:25 PST 2024
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/79110
>From ef01c948c7658543661f0a54abd93f893785129d Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 23 Jan 2024 09:42:02 +0100
Subject: [PATCH 1/3] [AMDGPU] Fix mul combine for MUL24
MUL24 can now return an i64 for i32 operands, but the combine was never updated to handle this case.
Extend the operand when rewriting the ADD to handle it.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 +-
llvm/test/CodeGen/AMDGPU/mul-combine-crash.ll | 47 +++++++++++++++++++
2 files changed, 49 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/mul-combine-crash.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 55d95154c75878b..109e86eb4117a2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4246,12 +4246,12 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
// operands, so we have to place the mul in the LHS
if (SDValue MulOper = IsFoldableAdd(N0)) {
SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
- return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
+ return DAG.getNode(ISD::ADD, DL, VT, MulVal, DAG.getZExtOrTrunc(N1, DL, VT));
}
if (SDValue MulOper = IsFoldableAdd(N1)) {
SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
- return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
+ return DAG.getNode(ISD::ADD, DL, VT, MulVal, DAG.getZExtOrTrunc(N0, DL, VT));
}
// Skip if already mul24.
diff --git a/llvm/test/CodeGen/AMDGPU/mul-combine-crash.ll b/llvm/test/CodeGen/AMDGPU/mul-combine-crash.ll
new file mode 100644
index 000000000000000..624c8aa859c0939
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mul-combine-crash.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; Checks that the DAG mul combine can handle a MUL24 with a i32 and i64
+; operand.
+
+define i64 @test(i64 %x, i32 %z) {
+; CHECK-LABEL: test:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
+; CHECK-NEXT: v_and_b32_e32 v2, 1, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %a = add i64 %x, 0
+ %b = and i64 %a, 255
+ %c = and i32 %z, 1
+ %d = add nuw nsw i32 %c, 1
+ %e = zext nneg i32 %d to i64
+ %f = mul nuw nsw i64 %b, %e
+ %g = add nuw nsw i64 %f, 0
+ ret i64 %g
+}
+
+define i64 @test_swapped(i64 %x, i32 %z) {
+; CHECK-LABEL: test_swapped:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
+; CHECK-NEXT: v_and_b32_e32 v2, 1, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %a = add i64 %x, 0
+ %b = and i64 %a, 255
+ %c = and i32 %z, 1
+ %d = add nuw nsw i32 %c, 1
+ %e = zext nneg i32 %d to i64
+ %f = mul nuw nsw i64 %e, %b
+ %g = add nuw nsw i64 %f, 0
+ ret i64 %g
+}
>From d43874c0654e54aa8edcf2d6c10d09de44128a4c Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 23 Jan 2024 09:49:21 +0100
Subject: [PATCH 2/3] clang-format
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 109e86eb4117a2f..484a9f31a45c761 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4246,12 +4246,14 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
// operands, so we have to place the mul in the LHS
if (SDValue MulOper = IsFoldableAdd(N0)) {
SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
- return DAG.getNode(ISD::ADD, DL, VT, MulVal, DAG.getZExtOrTrunc(N1, DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, MulVal,
+ DAG.getZExtOrTrunc(N1, DL, VT));
}
if (SDValue MulOper = IsFoldableAdd(N1)) {
SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
- return DAG.getNode(ISD::ADD, DL, VT, MulVal, DAG.getZExtOrTrunc(N0, DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, MulVal,
+ DAG.getZExtOrTrunc(N0, DL, VT));
}
// Skip if already mul24.
>From 236b0fd0de4db3e29e75050464b8b706e71111c6 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 29 Jan 2024 07:28:13 +0100
Subject: [PATCH 3/3] disable combine for mul24
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 12 +---
llvm/test/CodeGen/AMDGPU/mul-combine-crash.ll | 14 ++--
.../AMDGPU/reassoc-mul-add-1-to-mad.ll | 72 ++++++++++++-------
3 files changed, 57 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 484a9f31a45c761..97cef8468c26d8a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4246,20 +4246,14 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
// operands, so we have to place the mul in the LHS
if (SDValue MulOper = IsFoldableAdd(N0)) {
SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
- return DAG.getNode(ISD::ADD, DL, VT, MulVal,
- DAG.getZExtOrTrunc(N1, DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
}
if (SDValue MulOper = IsFoldableAdd(N1)) {
SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
- return DAG.getNode(ISD::ADD, DL, VT, MulVal,
- DAG.getZExtOrTrunc(N0, DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
}
- // Skip if already mul24.
- if (N->getOpcode() != ISD::MUL)
- return SDValue();
-
// There are i16 integer mul/mad.
if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
return SDValue();
@@ -5083,7 +5077,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::MUL_I24: {
if (SDValue Simplified = simplifyMul24(N, DCI))
return Simplified;
- return performMulCombine(N, DCI);
+ break;
}
case AMDGPUISD::MULHI_I24:
case AMDGPUISD::MULHI_U24:
diff --git a/llvm/test/CodeGen/AMDGPU/mul-combine-crash.ll b/llvm/test/CodeGen/AMDGPU/mul-combine-crash.ll
index 624c8aa859c0939..fff68c9235cc442 100644
--- a/llvm/test/CodeGen/AMDGPU/mul-combine-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul-combine-crash.ll
@@ -8,10 +8,9 @@ define i64 @test(i64 %x, i32 %z) {
; CHECK-LABEL: test:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
-; CHECK-NEXT: v_and_b32_e32 v2, 1, v2
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[0:1]
+; CHECK-NEXT: v_and_b32_e32 v1, 1, v2
+; CHECK-NEXT: v_add_u32_e32 v1, 1, v1
+; CHECK-NEXT: v_mul_u32_u24_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -29,10 +28,9 @@ define i64 @test_swapped(i64 %x, i32 %z) {
; CHECK-LABEL: test_swapped:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
-; CHECK-NEXT: v_and_b32_e32 v2, 1, v2
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[0:1]
+; CHECK-NEXT: v_and_b32_e32 v1, 1, v2
+; CHECK-NEXT: v_add_u32_e32 v1, 1, v1
+; CHECK-NEXT: v_mul_u32_u24_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index 3c654e9e2c9e155..02a20444a4bf676 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -338,25 +338,29 @@ define i24 @v_mul_add_1_i24_zext(i24 zeroext %x, i24 zeroext %y) {
; GFX67-LABEL: v_mul_add_1_i24_zext:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_add_1_i24_zext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_add_1_i24_zext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_add_1_i24_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%add = add i24 %y, 1
%mul = mul i24 %x, %add
@@ -429,25 +433,29 @@ define i24 @v_mul_add_1_i24_sext(i24 signext %x, i24 signext %y) {
; GFX67-LABEL: v_mul_add_1_i24_sext:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_add_1_i24_sext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_add_1_i24_sext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_add_1_i24_sext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%add = add i24 %y, 1
%mul = mul i24 %x, %add
@@ -2306,29 +2314,37 @@ define <2 x i24> @v_mul_add_1_v2i24(<2 x i24> %x, <2 x i24> %y) {
; GFX67-LABEL: v_mul_add_1_v2i24:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3
+; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_add_1_v2i24:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v2
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX8-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_add_1_v2i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX9-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v2
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_add_1_v2i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX10-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX10-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%add = add <2 x i24> %y, <i24 1, i24 1>
%mul = mul <2 x i24> %x, %add
@@ -2339,29 +2355,37 @@ define <2 x i24> @v_mul_add_1_v2i24_commute(<2 x i24> %x, <2 x i24> %y) {
; GFX67-LABEL: v_mul_add_1_v2i24_commute:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3
+; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX67-NEXT: v_mul_u32_u24_e32 v1, v3, v1
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_add_1_v2i24_commute:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v2
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX8-NEXT: v_mul_u32_u24_e32 v1, v3, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_add_1_v2i24_commute:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX9-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v2
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX9-NEXT: v_mul_u32_u24_e32 v1, v3, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_add_1_v2i24_commute:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX10-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX10-NEXT: v_mul_u32_u24_e32 v1, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%add = add <2 x i24> %y, <i24 1, i24 1>
%mul = mul <2 x i24> %add, %x
More information about the llvm-commits
mailing list