[llvm] r350134 - [X86] Directly emit X86ISD::PMULUDQ from the ReplaceNodeResults handling of v2i8/v2i16/v2i32 multiply.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 28 11:19:39 PST 2018
Author: ctopper
Date: Fri Dec 28 11:19:39 2018
New Revision: 350134
URL: http://llvm.org/viewvc/llvm-project?rev=350134&view=rev
Log:
[X86] Directly emit X86ISD::PMULUDQ from the ReplaceNodeResults handling of v2i8/v2i16/v2i32 multiply.
Previously we emitted a multiply and some masking that was supposed to be matched to PMULUDQ, but the masking could sometimes be removed before we got a chance to match it. So instead, just emit the PMULUDQ directly.
Remove the DAG combine that was added when the ReplaceNodeResults code was originally introduced. Add a new DAG combine to avoid regressions in shrink_vmul.ll.
Some of the shrink_vmul.ll test cases now pick PMULUDQ instead of PMADDWD/PMULLD, but I think this should be an improvement on most CPUs.
I think all of this can go away if/when we switch to -x86-experimental-vector-widening-legalization.
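For illustration, a minimal IR sketch of the kind of narrow multiply this path handles (the function name is just an example, not taken from the patch's tests). With the default, non-widening legalization the v2i32 operands are any-extended to v2i64, and after this change ReplaceNodeResults lowers the multiply straight to X86ISD::PMULUDQ instead of a generic v2i64 MUL wrapped in 0xffffffff masks:

  define <2 x i32> @mul_v2i32(<2 x i32> %a, <2 x i32> %b) {
    ; Only the low 32 bits of each result element are kept, and PMULUDQ
    ; itself only reads the low 32 bits of each 64-bit lane, so whatever
    ; the any_extend leaves in the high bits is harmless and the AND
    ; masks that used to make the pattern matchable are unnecessary.
    %m = mul <2 x i32> %a, %b
    ret <2 x i32> %m
  }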
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/shrink_vmul.ll
llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=350134&r1=350133&r2=350134&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Dec 28 11:19:39 2018
@@ -26246,13 +26246,9 @@ void X86TargetLowering::ReplaceNodeResul
// Promote to a pattern that will be turned into PMULUDQ.
SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
N->getOperand(0));
- N0 = DAG.getNode(ISD::AND, dl, MVT::v2i64, N0,
- DAG.getConstant(0xffffffff, dl, MVT::v2i64));
SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
N->getOperand(1));
- N1 = DAG.getNode(ISD::AND, dl, MVT::v2i64, N1,
- DAG.getConstant(0xffffffff, dl, MVT::v2i64));
- SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v2i64, N0, N1);
+ SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1);
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
} else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
VT.getVectorElementType() == MVT::i8) {
@@ -32250,6 +32246,52 @@ static SDValue combineShuffle(SDNode *N,
return SDValue(N, 0);
}
+ // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
+ // operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
+ // FIXME: This can probably go away once we default to widening legalization.
+ if (Subtarget.hasSSE41() && VT == MVT::v4i32 &&
+ N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+ N->getOperand(0).getOpcode() == ISD::BITCAST &&
+ N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) {
+ SDValue BC = N->getOperand(0);
+ SDValue MULUDQ = BC.getOperand(0);
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ ArrayRef<int> Mask = SVOp->getMask();
+ if (BC.hasOneUse() && MULUDQ.hasOneUse() &&
+ Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) {
+ SDValue Op0 = MULUDQ.getOperand(0);
+ SDValue Op1 = MULUDQ.getOperand(1);
+ if (Op0.getOpcode() == ISD::BITCAST &&
+ Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
+ Op0.getOperand(0).getValueType() == MVT::v4i32) {
+ ShuffleVectorSDNode *SVOp0 =
+ cast<ShuffleVectorSDNode>(Op0.getOperand(0));
+ ArrayRef<int> Mask2 = SVOp0->getMask();
+ if (Mask2[0] == 0 && Mask2[1] == -1 &&
+ Mask2[2] == 1 && Mask2[3] == -1) {
+ Op0 = SVOp0->getOperand(0);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+ Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask);
+ return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
+ }
+ }
+ if (Op1.getOpcode() == ISD::BITCAST &&
+ Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
+ Op1.getOperand(0).getValueType() == MVT::v4i32) {
+ ShuffleVectorSDNode *SVOp1 =
+ cast<ShuffleVectorSDNode>(Op1.getOperand(0));
+ ArrayRef<int> Mask2 = SVOp1->getMask();
+ if (Mask2[0] == 0 && Mask2[1] == -1 &&
+ Mask2[2] == 1 && Mask2[3] == -1) {
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask);
+ Op1 = SVOp1->getOperand(0);
+ return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
+ }
+ }
+ }
+ }
+
return SDValue();
}
@@ -35107,26 +35149,6 @@ static SDValue combineMul(SDNode *N, Sel
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
- // Look for multiply of 2 identical shuffles with a zero vector. Shuffle the
- // result and insert the zero there instead. This can occur due to
- // type legalization of v2i32 multiply to a PMULUDQ pattern.
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- if (!DCI.isBeforeLegalize() && isa<ShuffleVectorSDNode>(LHS) &&
- isa<ShuffleVectorSDNode>(RHS) && LHS.hasOneUse() && RHS.hasOneUse() &&
- LHS.getOperand(1) == RHS.getOperand(1) &&
- ISD::isBuildVectorAllZeros(LHS.getOperand(1).getNode())) {
- ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(LHS);
- ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
- if (SVN0->getMask().equals(SVN1->getMask())) {
- SDLoc dl(N);
- SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, LHS.getOperand(0),
- RHS.getOperand(0));
- return DAG.getVectorShuffle(VT, dl, Mul, DAG.getConstant(0, dl, VT),
- SVN0->getMask());
- }
- }
-
if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
return V;
Modified: llvm/trunk/test/CodeGen/X86/shrink_vmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shrink_vmul.ll?rev=350134&r1=350133&r2=350134&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shrink_vmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shrink_vmul.ll Fri Dec 28 11:19:39 2018
@@ -44,7 +44,7 @@ define void @mul_2xi8(i8* nocapture read
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
@@ -70,7 +70,7 @@ define void @mul_2xi8(i8* nocapture read
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
@@ -916,7 +916,7 @@ define void @mul_2xi8_sext(i8* nocapture
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
@@ -944,7 +944,7 @@ define void @mul_2xi8_sext(i8* nocapture
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
@@ -1004,7 +1004,7 @@ define void @mul_2xi8_sext_zext(i8* noca
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
@@ -1033,7 +1033,7 @@ define void @mul_2xi8_sext_zext(i8* noca
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
@@ -1087,7 +1087,7 @@ define void @mul_2xi16_sext(i8* nocaptur
; X86-AVX-NEXT: movl c, %esi
; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
@@ -1110,7 +1110,7 @@ define void @mul_2xi16_sext(i8* nocaptur
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
@@ -1169,9 +1169,8 @@ define void @mul_2xi16_sext_zext(i8* noc
; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
@@ -1198,9 +1197,8 @@ define void @mul_2xi16_sext_zext(i8* noc
; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
@@ -1406,7 +1404,7 @@ define void @mul_2xi8_varconst1(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
@@ -1430,7 +1428,7 @@ define void @mul_2xi8_varconst1(i8* noca
; X64-AVX-NEXT: movl $255, %ecx
; X64-AVX-NEXT: vmovq %rcx, %xmm1
; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
@@ -1474,7 +1472,7 @@ define void @mul_2xi8_varconst2(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
@@ -1496,7 +1494,7 @@ define void @mul_2xi8_varconst2(i8* noca
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
@@ -1542,7 +1540,7 @@ define void @mul_2xi8_varconst3(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
@@ -1569,7 +1567,7 @@ define void @mul_2xi8_varconst3(i8* noca
; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100
; X64-AVX-NEXT: vmovq %rcx, %xmm1
; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
@@ -1615,7 +1613,7 @@ define void @mul_2xi8_varconst4(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
@@ -1639,7 +1637,7 @@ define void @mul_2xi8_varconst4(i8* noca
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
@@ -1685,7 +1683,7 @@ define void @mul_2xi8_varconst5(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
@@ -1709,7 +1707,7 @@ define void @mul_2xi8_varconst5(i8* noca
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
@@ -1755,7 +1753,7 @@ define void @mul_2xi8_varconst6(i8* noca
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
@@ -1779,7 +1777,7 @@ define void @mul_2xi8_varconst6(i8* noca
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
@@ -1823,9 +1821,7 @@ define void @mul_2xi16_varconst1(i8* noc
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1846,12 +1842,11 @@ define void @mul_2xi16_varconst1(i8* noc
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-AVX-NEXT: movl $65535, %ecx # imm = 0xFFFF
; X64-AVX-NEXT: vmovq %rcx, %xmm1
; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1893,7 +1888,7 @@ define void @mul_2xi16_varconst2(i8* noc
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
@@ -1914,7 +1909,7 @@ define void @mul_2xi16_varconst2(i8* noc
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
@@ -1958,9 +1953,7 @@ define void @mul_2xi16_varconst3(i8* noc
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1984,12 +1977,11 @@ define void @mul_2xi16_varconst3(i8* noc
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-AVX-NEXT: movl $65536, %ecx # imm = 0x10000
; X64-AVX-NEXT: vmovq %rcx, %xmm1
; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -2031,7 +2023,7 @@ define void @mul_2xi16_varconst4(i8* noc
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
@@ -2058,7 +2050,7 @@ define void @mul_2xi16_varconst4(i8* noc
; X64-AVX-NEXT: movl $32768, %ecx # imm = 0x8000
; X64-AVX-NEXT: vmovq %rcx, %xmm1
; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll?rev=350134&r1=350133&r2=350134&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll Fri Dec 28 11:19:39 2018
@@ -790,35 +790,12 @@ define i32 @test_v2i32(<2 x i32> %a0) {
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
;
-; AVX512BW-LABEL: test_v2i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v2i32:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovd %xmm0, %eax
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v2i32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovd %xmm0, %eax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v2i32:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovd %xmm0, %eax
-; AVX512DQVL-NEXT: retq
+; AVX512-LABEL: test_v2i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> %a0)
ret i32 %1
}
@@ -1156,39 +1133,13 @@ define i16 @test_v2i16(<2 x i16> %a0) {
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
-; AVX512BW-LABEL: test_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v2i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovd %xmm0, %eax
-; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v2i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovd %xmm0, %eax
-; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v2i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovd %xmm0, %eax
-; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512DQVL-NEXT: retq
+; AVX512-LABEL: test_v2i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: retq
%1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> %a0)
ret i16 %1
}
@@ -1634,39 +1585,13 @@ define i8 @test_v2i8(<2 x i8> %a0) {
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
-; AVX512BW-LABEL: test_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v2i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQVL-NEXT: retq
+; AVX512-LABEL: test_v2i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
%1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> %a0)
ret i8 %1
}