[llvm] ce72f78 - [AMDGPU] Fix mul combine for MUL24 (#79110)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 29 07:37:24 PST 2024
Author: Pierre van Houtryve
Date: 2024-01-29T16:37:20+01:00
New Revision: ce72f78f37199d693a65b6c7c1d637fafbb13727
URL: https://github.com/llvm/llvm-project/commit/ce72f78f37199d693a65b6c7c1d637fafbb13727
DIFF: https://github.com/llvm/llvm-project/commit/ce72f78f37199d693a65b6c7c1d637fafbb13727.diff
LOG: [AMDGPU] Fix mul combine for MUL24 (#79110)
MUL24 can now return an i64 for i32 operands, but the combine was never
updated to handle this case. Extend the operand when rewriting the ADD
to handle it.
Fixes SWDEV-436654
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 55d95154c75878..b420e72d87ed09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4206,6 +4206,7 @@ static SDValue getAddOneOp(const SDNode *V) {
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ assert(N->getOpcode() == ISD::MUL);
EVT VT = N->getValueType(0);
// Don't generate 24-bit multiplies on values that are in SGPRs, since
@@ -4254,10 +4255,6 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
}
- // Skip if already mul24.
- if (N->getOpcode() != ISD::MUL)
- return SDValue();
-
// There are i16 integer mul/mad.
if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
return SDValue();
@@ -5081,7 +5078,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::MUL_I24: {
if (SDValue Simplified = simplifyMul24(N, DCI))
return Simplified;
- return performMulCombine(N, DCI);
+ break;
}
case AMDGPUISD::MULHI_I24:
case AMDGPUISD::MULHI_U24:
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index 3c654e9e2c9e15..77e1694dbe7e19 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -338,25 +338,29 @@ define i24 @v_mul_add_1_i24_zext(i24 zeroext %x, i24 zeroext %y) {
; GFX67-LABEL: v_mul_add_1_i24_zext:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_add_1_i24_zext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_add_1_i24_zext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_add_1_i24_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%add = add i24 %y, 1
%mul = mul i24 %x, %add
@@ -429,25 +433,29 @@ define i24 @v_mul_add_1_i24_sext(i24 signext %x, i24 signext %y) {
; GFX67-LABEL: v_mul_add_1_i24_sext:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_add_1_i24_sext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_add_1_i24_sext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_add_1_i24_sext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%add = add i24 %y, 1
%mul = mul i24 %x, %add
@@ -2306,29 +2314,37 @@ define <2 x i24> @v_mul_add_1_v2i24(<2 x i24> %x, <2 x i24> %y) {
; GFX67-LABEL: v_mul_add_1_v2i24:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3
+; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_add_1_v2i24:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v2
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX8-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_add_1_v2i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX9-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v2
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_add_1_v2i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX10-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX10-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%add = add <2 x i24> %y, <i24 1, i24 1>
%mul = mul <2 x i24> %x, %add
@@ -2339,29 +2355,37 @@ define <2 x i24> @v_mul_add_1_v2i24_commute(<2 x i24> %x, <2 x i24> %y) {
; GFX67-LABEL: v_mul_add_1_v2i24_commute:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3
+; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX67-NEXT: v_mul_u32_u24_e32 v1, v3, v1
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_add_1_v2i24_commute:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v2
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX8-NEXT: v_mul_u32_u24_e32 v1, v3, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_add_1_v2i24_commute:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX9-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v2
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX9-NEXT: v_mul_u32_u24_e32 v1, v3, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_add_1_v2i24_commute:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX10-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX10-NEXT: v_mul_u32_u24_e32 v1, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%add = add <2 x i24> %y, <i24 1, i24 1>
%mul = mul <2 x i24> %add, %x
@@ -3692,10 +3716,186 @@ define <2 x i8> @v_mul_add_1_v2i8_commute(<2 x i8> %x, <2 x i8> %y) {
ret <2 x i8> %mul
}
+; test mul_u24 intrinsic with (i32, i32) -> i64
+define i64 @mul_u24_with_uneven_operands(i32 %z) {
+; GFX67-LABEL: mul_u24_with_uneven_operands:
+; GFX67: ; %bb.0: ; %entry
+; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v0
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v1, v0
+; GFX67-NEXT: v_mov_b32_e32 v1, 0
+; GFX67-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mul_u24_with_uneven_operands:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_u24_with_uneven_operands:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v0
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_u24_with_uneven_operands:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %c = and i32 %z, 1
+ %d = add nuw nsw i32 %c, 1
+ %f = call i64 @llvm.amdgcn.mul.u24(i32 %d, i32 %c)
+ ret i64 %f
+}
+
+define i64 @mul_u24_with_uneven_operands_swapped(i32 %z) {
+; GFX67-LABEL: mul_u24_with_uneven_operands_swapped:
+; GFX67: ; %bb.0: ; %entry
+; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v0
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT: v_mov_b32_e32 v1, 0
+; GFX67-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mul_u24_with_uneven_operands_swapped:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_u24_with_uneven_operands_swapped:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v0
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_u24_with_uneven_operands_swapped:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %c = and i32 %z, 1
+ %d = add nuw nsw i32 %c, 1
+ %f = call i64 @llvm.amdgcn.mul.u24(i32 %c, i32 %d)
+ ret i64 %f
+}
+
+; test mul_i24 intrinsic with (i32, i32) -> i64
+define i64 @mul_i24_with_uneven_operands(i32 %z) {
+; GFX67-LABEL: mul_i24_with_uneven_operands:
+; GFX67: ; %bb.0: ; %entry
+; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v0
+; GFX67-NEXT: v_mul_hi_i32_i24_e32 v1, v2, v0
+; GFX67-NEXT: v_mul_i32_i24_e32 v0, v2, v0
+; GFX67-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mul_i24_with_uneven_operands:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_mul_hi_i32_i24_e32 v1, v2, v0
+; GFX8-NEXT: v_mul_i32_i24_e32 v0, v2, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_i24_with_uneven_operands:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v2, v0
+; GFX9-NEXT: v_mul_i32_i24_e32 v0, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_i24_with_uneven_operands:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v1
+; GFX10-NEXT: v_mul_i32_i24_e32 v0, v2, v1
+; GFX10-NEXT: v_mul_hi_i32_i24_e32 v1, v2, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %c = and i32 %z, 1
+ %d = add nuw nsw i32 %c, 1
+ %f = call i64 @llvm.amdgcn.mul.i24(i32 %d, i32 %c)
+ ret i64 %f
+}
+
+define i64 @mul_i24_with_uneven_operands_swapped(i32 %z) {
+; GFX67-LABEL: mul_i24_with_uneven_operands_swapped:
+; GFX67: ; %bb.0: ; %entry
+; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v0
+; GFX67-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
+; GFX67-NEXT: v_mul_i32_i24_e32 v0, v0, v2
+; GFX67-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mul_i24_with_uneven_operands_swapped:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
+; GFX8-NEXT: v_mul_i32_i24_e32 v0, v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_i24_with_uneven_operands_swapped:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
+; GFX9-NEXT: v_mul_i32_i24_e32 v0, v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_i24_with_uneven_operands_swapped:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v1
+; GFX10-NEXT: v_mul_i32_i24_e32 v0, v1, v2
+; GFX10-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %c = and i32 %z, 1
+ %d = add nuw nsw i32 %c, 1
+ %f = call i64 @llvm.amdgcn.mul.i24(i32 %c, i32 %d)
+ ret i64 %f
+}
+
declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #2
declare i32 @llvm.amdgcn.workitem.id.x() #2
declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #2
declare i32 @llvm.amdgcn.workgroup.id.x() #2
+declare i64 @llvm.amdgcn.mul.u24(i32, i32)
+declare i64 @llvm.amdgcn.mul.i24(i32, i32)
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(read, argmem: readwrite, inaccessiblemem: none) }
More information about the llvm-commits
mailing list