[llvm] r289426 - [X86][SSE] Lower suitably sign-extended mul vXi64 using PMULDQ
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 12 02:49:16 PST 2016
Author: rksimon
Date: Mon Dec 12 04:49:15 2016
New Revision: 289426
URL: http://llvm.org/viewvc/llvm-project?rev=289426&view=rev
Log:
[X86][SSE] Lower suitably sign-extended mul vXi64 using PMULDQ
PMULDQ returns the 64-bit result of the signed multiplication of the lower 32 bits of the vXi64 vector inputs, so we can lower with it if the sign bits stretch that far.
Differential Revision: https://reviews.llvm.org/D27657
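For readers unfamiliar with the instruction, here is a minimal scalar sketch of the per-lane PMULDQ semantics relied on here (illustrative only, not part of the patch; the helper name is made up):

#include <cstdint>

// Per-lane model of (V)PMULDQ: sign-extend the low 32 bits of each 64-bit
// lane and return their full 64-bit signed product.
static int64_t pmuldq_lane(uint64_t a, uint64_t b) {
  int64_t lo_a = static_cast<int32_t>(static_cast<uint32_t>(a));
  int64_t lo_b = static_cast<int32_t>(static_cast<uint32_t>(b));
  return lo_a * lo_b;
}

// The new lowering is valid whenever both 64-bit inputs are effectively
// sign-extended from at most 32 bits (i.e. ComputeNumSignBits(...) > 32),
// because then the low 32 bits already carry each lane's whole value.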
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
llvm/trunk/test/CodeGen/X86/pmul.ll
llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=289426&r1=289425&r2=289426&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Dec 12 04:49:15 2016
@@ -20510,6 +20510,17 @@ static SDValue LowerMUL(SDValue Op, cons
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
+ // 32-bit vector types used for MULDQ/MULUDQ.
+ MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
+
+ // MULDQ returns the 64-bit result of the signed multiplication of the lower
+ // 32-bits. We can lower with this if the sign bits stretch that far.
+ if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
+ DAG.ComputeNumSignBits(B) > 32) {
+ return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
+ DAG.getBitcast(MulVT, B));
+ }
+
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
@@ -20528,9 +20539,7 @@ static SDValue LowerMUL(SDValue Op, cons
bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
- // Bit cast to 32-bit vectors for MULUDQ
- MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
- (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
+ // Bit cast to 32-bit vectors for MULUDQ.
SDValue Alo = DAG.getBitcast(MulVT, A);
SDValue Blo = DAG.getBitcast(MulVT, B);
@@ -25730,11 +25739,19 @@ void X86TargetLowering::computeKnownBits
}
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
- SDValue Op, const SelectionDAG &, unsigned Depth) const {
+ SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
if (Op.getOpcode() == X86ISD::SETCC_CARRY)
return Op.getScalarValueSizeInBits();
+ if (Op.getOpcode() == X86ISD::VSEXT) {
+ EVT VT = Op.getValueType();
+ EVT SrcVT = Op.getOperand(0).getValueType();
+ unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
+ return Tmp;
+ }
+
// Fallback case.
return 1;
}
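The ComputeNumSignBitsForTargetNode change is what lets the tests below reach the new path: a VSEXT (e.g. vpmovsxdq, i32 -> i64) guarantees more than 32 sign bits per lane. A standalone sketch of that arithmetic, using made-up helper names rather than the real LLVM APIs:

// Sign-extending a lane from SrcBits to DstBits adds (DstBits - SrcBits)
// known sign bits on top of whatever the narrower source already had.
static unsigned signBitsAfterSext(unsigned SrcSignBits, unsigned SrcBits,
                                  unsigned DstBits) {
  return SrcSignBits + (DstBits - SrcBits);
}

// Example: i32 -> i64 gives at least 1 + (64 - 32) = 33 sign bits, so both
// operands pass the "> 32" check and the multiply lowers to a single
// PMULDQ instead of the three-multiply PMULUDQ expansion.
static bool qualifiesForPMULDQ(unsigned SignBitsA, unsigned SignBitsB) {
  return SignBitsA > 32 && SignBitsB > 32;
}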
Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=289426&r1=289425&r2=289426&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Mon Dec 12 04:49:15 2016
@@ -457,19 +457,14 @@ define <8 x i32> @test9(%struct.ST* %bas
; KNL_64-LABEL: test9:
; KNL_64: # BB#0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
-; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
-; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
-; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
-; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
-; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
@@ -531,19 +526,14 @@ define <8 x i32> @test10(%struct.ST* %ba
; KNL_64-LABEL: test10:
; KNL_64: # BB#0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
-; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
-; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
-; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
-; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
-; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
@@ -2077,15 +2067,6 @@ define void @test_scatter_16f64(<16 x do
declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
-; SKX-LABEL: test_pr28312:
-; SKX: # BB#0:
-; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
-; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
-; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
-; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; SKX-NEXT: retq
-;
; KNL_64-LABEL: test_pr28312:
; KNL_64: # BB#0:
; KNL_64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
@@ -2100,6 +2081,15 @@ define <4 x i64> @test_pr28312(<4 x i64*
; KNL_64-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; KNL_64-NEXT: retq
+;
+; SKX-LABEL: test_pr28312:
+; SKX: # BB#0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
+; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
+; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; SKX-NEXT: retq
%g1 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g2 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g3 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
Modified: llvm/trunk/test/CodeGen/X86/pmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pmul.ll?rev=289426&r1=289425&r2=289426&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pmul.ll Mon Dec 12 04:49:15 2016
@@ -1546,65 +1546,24 @@ define <8 x i64> @mul_v8i64_sext(<8 x i1
;
; SSE41-LABEL: mul_v8i64_sext:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwq %xmm2, %xmm2
-; SSE41-NEXT: pmovsxwq %xmm0, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
-; SSE41-NEXT: pmovsxdq %xmm6, %xmm8
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,0,1]
-; SSE41-NEXT: pmovsxdq %xmm7, %xmm9
-; SSE41-NEXT: pmovsxdq %xmm4, %xmm4
-; SSE41-NEXT: pmovsxdq %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pmuludq %xmm7, %xmm6
-; SSE41-NEXT: movdqa %xmm7, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm5
-; SSE41-NEXT: pmuludq %xmm0, %xmm5
-; SSE41-NEXT: psllq $32, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm0
-; SSE41-NEXT: pmuludq %xmm7, %xmm0
-; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: paddq %xmm5, %xmm0
-; SSE41-NEXT: paddq %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: pmuludq %xmm4, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: psrlq $32, %xmm6
-; SSE41-NEXT: pmuludq %xmm2, %xmm6
-; SSE41-NEXT: psllq $32, %xmm6
-; SSE41-NEXT: psrlq $32, %xmm2
-; SSE41-NEXT: pmuludq %xmm4, %xmm2
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: paddq %xmm6, %xmm2
-; SSE41-NEXT: paddq %xmm5, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pmuludq %xmm9, %xmm4
-; SSE41-NEXT: movdqa %xmm9, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm5
-; SSE41-NEXT: pmuludq %xmm1, %xmm5
-; SSE41-NEXT: psllq $32, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm1
-; SSE41-NEXT: pmuludq %xmm9, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: paddq %xmm5, %xmm1
-; SSE41-NEXT: paddq %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pmuludq %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm8, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm5
-; SSE41-NEXT: pmuludq %xmm3, %xmm5
-; SSE41-NEXT: psllq $32, %xmm5
-; SSE41-NEXT: psrlq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm8, %xmm3
-; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: paddq %xmm5, %xmm3
-; SSE41-NEXT: paddq %xmm4, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm7
+; SSE41-NEXT: pmovsxwq %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm3
+; SSE41-NEXT: pmovsxdq %xmm2, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm4
+; SSE41-NEXT: pmovsxdq %xmm1, %xmm0
+; SSE41-NEXT: pmuldq %xmm5, %xmm0
+; SSE41-NEXT: pmuldq %xmm7, %xmm4
+; SSE41-NEXT: pmuldq %xmm6, %xmm2
+; SSE41-NEXT: pmuldq %xmm8, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v8i64_sext:
@@ -1615,39 +1574,15 @@ define <8 x i64> @mul_v8i64_sext(<8 x i1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm4
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm5
-; AVX2-NEXT: vpmuludq %ymm5, %ymm0, %ymm5
-; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
-; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
-; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v8i64_sext:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1
-; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
-; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3
-; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
-; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
-; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
-; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%1 = sext <8 x i16> %val1 to <8 x i64>
%2 = sext <8 x i32> %val2 to <8 x i64>
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll?rev=289426&r1=289425&r2=289426&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll Mon Dec 12 04:49:15 2016
@@ -5200,24 +5200,8 @@ define <4 x i32> @mul_add_v4i64_v4i32(<4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
-; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
-; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm1
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -5229,15 +5213,7 @@ define <4 x i32> @mul_add_v4i64_v4i32(<4
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -5249,15 +5225,7 @@ define <4 x i32> @mul_add_v4i64_v4i32(<4
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3
-; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>