[llvm] r329510 - [X86] Combine vXi64 multiplies to MULDQ/MULUDQ during DAG combine instead of lowering.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 7 12:09:52 PDT 2018
Author: ctopper
Date: Sat Apr 7 12:09:52 2018
New Revision: 329510
URL: http://llvm.org/viewvc/llvm-project?rev=329510&view=rev
Log:
[X86] Combine vXi64 multiplies to MULDQ/MULUDQ during DAG combine instead of lowering.
Previously we used a custom lowering for this because of the AVX1 splitting requirement. But we can do the split during DAG combine if we check the types and the subtarget.
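For reference, a minimal IR sketch (not part of this commit; function names are illustrative) of the two patterns the new combine recognizes. When both operands of a vXi64 multiply are zero-extended from i32, the high 32 bits are known zero and a single PMULUDQ suffices; when both are sign-extended, the sign bits stretch past bit 32 and PMULDQ can be used on SSE4.1+ targets.

define <2 x i64> @mul_zext_v2i64(<2 x i32> %a, <2 x i32> %b) {
  ; upper 32 bits of both operands are known zero -> single pmuludq
  %xa = zext <2 x i32> %a to <2 x i64>
  %xb = zext <2 x i32> %b to <2 x i64>
  %r = mul <2 x i64> %xa, %xb
  ret <2 x i64> %r
}

define <2 x i64> @mul_sext_v2i64(<2 x i32> %a, <2 x i32> %b) {
  ; both operands have more than 32 sign bits -> pmuldq on SSE4.1+
  %xa = sext <2 x i32> %a to <2 x i64>
  %xb = sext <2 x i32> %b to <2 x i64>
  %r = mul <2 x i64> %xa, %xb
  ret <2 x i64> %r
}

For vectors wider than the target's legal register width (e.g. v4i64 on AVX1), SplitOpsAndApply now performs the split that the old custom lowering handled.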
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/combine-pmuldq.ll
llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
llvm/trunk/test/CodeGen/X86/mulvi32.ll
llvm/trunk/test/CodeGen/X86/pmul.ll
llvm/trunk/test/CodeGen/X86/xop-ifma.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=329510&r1=329509&r2=329510&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat Apr 7 12:09:52 2018
@@ -1344,6 +1344,8 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
if (Subtarget.hasCDI()) {
@@ -1432,6 +1434,8 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::UINT_TO_FP, VT, Legal);
setOperationAction(ISD::FP_TO_SINT, VT, Legal);
setOperationAction(ISD::FP_TO_UINT, VT, Legal);
+
+ setOperationAction(ISD::MUL, VT, Legal);
}
}
@@ -5117,10 +5121,11 @@ static SDValue widenSubVector(MVT VT, SD
template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
- F Builder) {
+ F Builder, bool CheckBWI = true) {
assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
unsigned NumSubs = 1;
- if (Subtarget.useBWIRegs()) {
+ if ((CheckBWI && Subtarget.useBWIRegs()) ||
+ (!CheckBWI && Subtarget.useAVX512Regs())) {
if (VT.getSizeInBits() > 512) {
NumSubs = VT.getSizeInBits() / 512;
assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -22516,13 +22521,7 @@ static SDValue LowerMUL(SDValue Op, cons
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
-
- // MULDQ returns the 64-bit result of the signed multiplication of the lower
- // 32-bits. We can lower with this if the sign bits stretch that far.
- if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
- DAG.ComputeNumSignBits(B) > 32) {
- return DAG.getNode(X86ISD::PMULDQ, dl, VT, A, B);
- }
+ assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
@@ -22545,11 +22544,6 @@ static SDValue LowerMUL(SDValue Op, cons
bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
- // If DQI is supported we can use MULLQ, but MULUDQ is still better if the
- // the high bits are known to be zero.
- if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
- return Op;
-
SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
// Only multiply lo/hi halves that aren't known to be zero.
@@ -33126,6 +33120,47 @@ static SDValue combineMulToPMADDWD(SDNod
PMADDWDBuilder);
}
+static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // Only support vXi64 vectors.
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // MULDQ returns the 64-bit result of the signed multiplication of the lower
+ // 32-bits. We can lower with this if the sign bits stretch that far.
+ if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
+ DAG.ComputeNumSignBits(N1) > 32) {
+ auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
+ PMULDQBuilder, /*CheckBWI*/false);
+ }
+
+ // If the upper bits are zero we can use a single pmuludq.
+ APInt Mask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
+ auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
+ };
+ return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
+ PMULUDQBuilder, /*CheckBWI*/false);
+ }
+
+ return SDValue();
+}
+
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
@@ -33136,6 +33171,9 @@ static SDValue combineMul(SDNode *N, Sel
if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
return V;
+ if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
+ return V;
+
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
@@ -35738,7 +35776,7 @@ static SDValue combineTruncatedArithmeti
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
// better to truncate if we have the chance.
if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
- !Subtarget.hasDQI())
+ !TLI.isOperationLegal(Opcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::ADD: {
Modified: llvm/trunk/test/CodeGen/X86/combine-pmuldq.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-pmuldq.ll?rev=329510&r1=329509&r2=329510&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-pmuldq.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-pmuldq.ll Sat Apr 7 12:09:52 2018
@@ -5,23 +5,14 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=AVX --check-prefix=AVX512DQVL
-; TODO - shuffle+sext are superfluous
define <2 x i64> @combine_shuffle_sext_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_sext_pmuldq:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SSE-NEXT: pmovsxdq %xmm0, %xmm0
-; SSE-NEXT: pmuldq %xmm2, %xmm0
+; SSE-NEXT: pmuldq %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_shuffle_sext_pmuldq:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX-NEXT: vpmovsxdq %xmm1, %xmm1
; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
@@ -32,23 +23,14 @@ define <2 x i64> @combine_shuffle_sext_p
ret <2 x i64> %5
}
-; TODO - shuffle+zext are superfluous
define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zext_pmuludq:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_shuffle_zext_pmuludq:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=329510&r1=329509&r2=329510&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Sat Apr 7 12:09:52 2018
@@ -472,7 +472,7 @@ define <8 x i32> @test9(%struct.ST* %bas
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
@@ -503,7 +503,7 @@ define <8 x i32> @test9(%struct.ST* %bas
; SKX_SMALL: # %bb.0: # %entry
; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX_SMALL-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
@@ -515,7 +515,7 @@ define <8 x i32> @test9(%struct.ST* %bas
; SKX_LARGE-LABEL: test9:
; SKX_LARGE: # %bb.0: # %entry
; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
-; SKX_LARGE-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
@@ -558,7 +558,7 @@ define <8 x i32> @test10(%struct.ST* %ba
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
@@ -589,7 +589,7 @@ define <8 x i32> @test10(%struct.ST* %ba
; SKX_SMALL: # %bb.0: # %entry
; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX_SMALL-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
@@ -601,7 +601,7 @@ define <8 x i32> @test10(%struct.ST* %ba
; SKX_LARGE-LABEL: test10:
; SKX_LARGE: # %bb.0: # %entry
; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
-; SKX_LARGE-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
Modified: llvm/trunk/test/CodeGen/X86/mulvi32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mulvi32.ll?rev=329510&r1=329509&r2=329510&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mulvi32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mulvi32.ll Sat Apr 7 12:09:52 2018
@@ -180,15 +180,13 @@ define <4 x i64> @_mul4xi32toi64a(<4 x i
;
; AVX1-LABEL: _mul4xi32toi64a:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: _mul4xi32toi64a:
Modified: llvm/trunk/test/CodeGen/X86/pmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pmul.ll?rev=329510&r1=329509&r2=329510&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pmul.ll Sat Apr 7 12:09:52 2018
@@ -1435,7 +1435,7 @@ define <8 x i64> @mul_v8i64_sext(<8 x i1
; AVX512-LABEL: mul_v8i64_sext:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%1 = sext <8 x i16> %val1 to <8 x i64>
Modified: llvm/trunk/test/CodeGen/X86/xop-ifma.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/xop-ifma.ll?rev=329510&r1=329509&r2=329510&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/xop-ifma.ll (original)
+++ llvm/trunk/test/CodeGen/X86/xop-ifma.ll Sat Apr 7 12:09:52 2018
@@ -81,8 +81,8 @@ define <4 x i64> @test_mulx_v4i32_add_v4
;
; XOP-AVX2-LABEL: test_mulx_v4i32_add_v4i64:
; XOP-AVX2: # %bb.0:
-; XOP-AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; XOP-AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; XOP-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; XOP-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; XOP-AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; XOP-AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; XOP-AVX2-NEXT: retq