[llvm] r340690 - [X86] Replace support for vXi32 SMUL_LOHI/UMUL_LOHI with MULHS/MULHU support instead.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 25 11:01:24 PDT 2018
Author: ctopper
Date: Sat Aug 25 11:01:24 2018
New Revision: 340690
URL: http://llvm.org/viewvc/llvm-project?rev=340690&view=rev
Log:
[X86] Replace support for vXi32 SMUL_LOHI/UMUL_LOHI with MULHS/MULHU support instead.
Summary:
The only time vector SMUL_LOHI/UMUL_LOHI nodes are created is during division/remainder lowering. If one is created before op legalization, generic DAGCombine immediately turns that SMUL_LOHI/UMUL_LOHI into a MULHS/MULHU since only the upper half is used. That node sticks around until vector op legalization, where it is turned back into UMUL_LOHI/SMUL_LOHI and then custom lowered by the X86 backend. Due to this two-step lowering, the vector shuffles created by the custom lowering get legalized after their inputs rather than before. This prevents the shuffles from being combined with any build_vector of constants.
This patch changes vXi32 to use MULHS/MULHU instead. This is what the later DAG combine produced anyway. By skipping the change back to UMUL_LOHI/SMUL_LOHI, we lower it before any constant BUILD_VECTORs are legalized, which allows the vector_shuffle creation to constant fold with the build_vectors. This accounts for the test changes here.
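
For illustration, here is a simplified sketch of the generic combine mentioned above, which replaces a LOHI node whose low half is unused with a plain high-half multiply. This is not the actual DAGCombiner code; the helper name is made up and error handling is omitted.

  #include "llvm/CodeGen/SelectionDAG.h"
  #include <cassert>
  using namespace llvm;

  // If only the high result of an SMUL_LOHI/UMUL_LOHI is used, rewrite that
  // use as MULHS/MULHU so no LOHI node has to survive to op legalization.
  static SDValue combineLoHiToHighHalf(SDNode *N, SelectionDAG &DAG) {
    assert(N->getOpcode() == ISD::SMUL_LOHI || N->getOpcode() == ISD::UMUL_LOHI);
    if (N->hasAnyUseOfValue(0)) // The low half is still used; nothing to do.
      return SDValue();
    unsigned HiOpc = N->getOpcode() == ISD::SMUL_LOHI ? ISD::MULHS : ISD::MULHU;
    return DAG.getNode(HiOpc, SDLoc(N), N->getValueType(1), N->getOperand(0),
                       N->getOperand(1));
  }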
Reviewers: RKSimon, spatel
Reviewed By: RKSimon
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D51254
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-128.ll
llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-256.ll
llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-512.ll
llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-128.ll
llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-256.ll
llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-512.ll
llvm/trunk/test/CodeGen/X86/vector-idiv.ll
llvm/trunk/test/CodeGen/X86/vselect-avx.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=340690&r1=340689&r2=340690&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat Aug 25 11:01:24 2018
@@ -782,8 +782,8 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
- setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
- setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
@@ -1087,9 +1087,8 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v32i8, Custom);
- setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
- setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
-
+ setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
@@ -1331,8 +1330,8 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
- setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
- setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
@@ -22901,6 +22900,75 @@ static SDValue LowerMULH(SDValue Op, con
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntArith(Op, DAG);
+ if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
+ assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
+ (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
+ (VT == MVT::v16i32 && Subtarget.hasAVX512()));
+ SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+
+ int NumElts = VT.getVectorNumElements();
+
+ // PMULxD operations multiply each even value (starting at 0) of LHS with
+ // the related value of RHS and produce a widened result.
+ // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+ // => <2 x i64> <ae|cg>
+ //
+ // In other words, to have all the results, we need to perform two PMULxD:
+ // 1. one with the even values.
+ // 2. one with the odd values.
+ // To achieve #2, we need to place the odd values at an even position.
+ //
+ // Place the odd value at an even position (basically, shift all values 1
+ // step to the left):
+ const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
+ 9, -1, 11, -1, 13, -1, 15, -1};
+ // <a|b|c|d> => <b|undef|d|undef>
+ SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
+ makeArrayRef(&Mask[0], NumElts));
+ // <e|f|g|h> => <f|undef|h|undef>
+ SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
+ makeArrayRef(&Mask[0], NumElts));
+
+ // Emit two multiplies, one for the lower 2 ints and one for the higher 2
+ // ints.
+ MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
+ bool IsSigned = Op->getOpcode() == ISD::MULHS;
+ unsigned Opcode =
+ (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+ // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+ // => <2 x i64> <ae|cg>
+ SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, Op0),
+ DAG.getBitcast(MulVT, Op1)));
+ // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+ // => <2 x i64> <bf|dh>
+ SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+ DAG.getBitcast(MulVT, Odd0),
+ DAG.getBitcast(MulVT, Odd1)));
+
+ // Shuffle it back into the right order.
+ SmallVector<int, 16> ShufMask(NumElts);
+ for (int i = 0; i != NumElts; ++i)
+ ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
+
+ SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
+
+ // If we have a signed multiply but no PMULDQ, fix up the result of the
+ // unsigned multiply.
+ if (IsSigned && !Subtarget.hasSSE41()) {
+ SDValue ShAmt = DAG.getConstant(31, dl, VT);
+ SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+ SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+ SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
+ }
+
+ return Res;
+ }
+
// Only i8 vectors should need custom lowering after this.
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
@@ -23084,105 +23152,6 @@ SDValue X86TargetLowering::LowerWin64_i1
return DAG.getBitcast(VT, CallInfo.first);
}
-static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
- MVT VT = Op0.getSimpleValueType();
- SDLoc dl(Op);
-
- // Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.is256BitVector() && !Subtarget.hasInt256()) {
- unsigned Opcode = Op.getOpcode();
- unsigned NumElems = VT.getVectorNumElements();
- MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
- SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
- SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
- SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
- SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
- SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
- SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
- SDValue Ops[] = {
- DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
- DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
- };
- return DAG.getMergeValues(Ops, dl);
- }
-
- assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
- (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
- (VT == MVT::v16i32 && Subtarget.hasAVX512()));
-
- int NumElts = VT.getVectorNumElements();
-
- // PMULxD operations multiply each even value (starting at 0) of LHS with
- // the related value of RHS and produce a widen result.
- // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
- // => <2 x i64> <ae|cg>
- //
- // In other word, to have all the results, we need to perform two PMULxD:
- // 1. one with the even values.
- // 2. one with the odd values.
- // To achieve #2, with need to place the odd values at an even position.
- //
- // Place the odd value at an even position (basically, shift all values 1
- // step to the left):
- const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
- // <a|b|c|d> => <b|undef|d|undef>
- SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
- makeArrayRef(&Mask[0], NumElts));
- // <e|f|g|h> => <f|undef|h|undef>
- SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
- makeArrayRef(&Mask[0], NumElts));
-
- // Emit two multiplies, one for the lower 2 ints and one for the higher 2
- // ints.
- MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
- bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
- unsigned Opcode =
- (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
- // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
- // => <2 x i64> <ae|cg>
- SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
- DAG.getBitcast(MulVT, Op0),
- DAG.getBitcast(MulVT, Op1)));
- // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
- // => <2 x i64> <bf|dh>
- SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
- DAG.getBitcast(MulVT, Odd0),
- DAG.getBitcast(MulVT, Odd1)));
-
- // Shuffle it back into the right order.
- SmallVector<int, 16> HighMask(NumElts);
- SmallVector<int, 16> LowMask(NumElts);
- for (int i = 0; i != NumElts; ++i) {
- HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
- LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
- }
-
- SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
- SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
-
- // If we have a signed multiply but no PMULDQ fix up the high parts of a
- // unsigned multiply.
- if (IsSigned && !Subtarget.hasSSE41()) {
- SDValue ShAmt = DAG.getConstant(
- 31, dl,
- DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
- SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
- SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
-
- SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
- Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
- }
-
- // The first result of MUL_LOHI is actually the low value, followed by the
- // high value.
- SDValue Ops[] = {Lows, Highs};
- return DAG.getMergeValues(Ops, dl);
-}
-
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
@@ -25579,8 +25548,6 @@ SDValue X86TargetLowering::LowerOperatio
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::MULHS:
case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
- case ISD::UMUL_LOHI:
- case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
case ISD::ROTL:
case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
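
A standalone illustration of two details of the new LowerMULH code above: the order the ShufMask loop restores after the two PMULxD multiplies, and the sign fixup used when PMULDQ is unavailable. This is plain C++ for demonstration only, not LLVM code; the helper name mulhsViaMulhu and the sample values are made up.

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  // mulhs(a, b) recovered from an unsigned high multiply (what PMULUDQ
  // provides) plus a correction in the style of the PSRAD/PAND/PADDD/PSUBD
  // sequence emitted for pre-SSE4.1 targets.
  static int32_t mulhsViaMulhu(int32_t a, int32_t b) {
    uint32_t Hu = (uint32_t)(((uint64_t)(uint32_t)a * (uint32_t)b) >> 32);
    uint32_t T1 = (uint32_t)(a >> 31) & (uint32_t)b; // a < 0 ? b : 0
    uint32_t T2 = (uint32_t)(b >> 31) & (uint32_t)a; // b < 0 ? a : 0
    return (int32_t)(Hu - (T1 + T2));
  }

  int main() {
    // For v4i32 the ShufMask formula picks the odd (high) 32-bit lanes of the
    // two 64-bit products and interleaves them back into source order.
    const int NumElts = 4;
    int ShufMask[NumElts];
    for (int i = 0; i != NumElts; ++i)
      ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
    for (int i = 0; i != NumElts; ++i)
      printf("%d ", ShufMask[i]); // prints "1 5 3 7 "
    printf("\n");

    // Spot-check the sign fixup against a reference signed high multiply.
    const int32_t Vals[] = {0, 1, -1, 7, -7,
                            -1840700269 /* 2454267027, the /7 magic */,
                            INT32_MIN, INT32_MAX};
    for (int32_t A : Vals)
      for (int32_t B : Vals)
        assert(mulhsViaMulhu(A, B) == (int32_t)(((int64_t)A * B) >> 32));
    return 0;
  }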
Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-128.ll?rev=340690&r1=340689&r2=340690&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-128.ll Sat Aug 25 11:01:24 2018
@@ -78,22 +78,19 @@ define <2 x i64> @test_div7_2i64(<2 x i6
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm3
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $31, %xmm0
@@ -134,13 +131,12 @@ define <4 x i32> @test_div7_4i32(<4 x i3
;
; AVX2-LABEL: test_div7_4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
@@ -384,33 +380,30 @@ define <4 x i32> @test_rem7_4i32(<4 x i3
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrld $31, %xmm2
-; SSE2-NEXT: psrad $2, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm3, %xmm2
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: psrld $31, %xmm1
+; SSE2-NEXT: psrad $2, %xmm2
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [7,7,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: psubd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_4i32:
@@ -448,13 +441,12 @@ define <4 x i32> @test_rem7_4i32(<4 x i3
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1
Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-256.ll?rev=340690&r1=340689&r2=340690&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-256.ll Sat Aug 25 11:01:24 2018
@@ -88,41 +88,37 @@ define <4 x i64> @test_div7_4i64(<4 x i6
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
-; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm2
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmuldq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrld $31, %ymm0, %ymm1
; AVX2-NEXT: vpsrad $2, %ymm0, %ymm0
@@ -363,46 +359,42 @@ define <4 x i64> @test_rem7_4i64(<4 x i6
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $31, %xmm2, %xmm4
+; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7]
+; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
-; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $31, %xmm1, %xmm4
-; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmuldq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2
; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1
Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-512.ll?rev=340690&r1=340689&r2=340690&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-sdiv-512.ll Sat Aug 25 11:01:24 2018
@@ -86,7 +86,6 @@ define <16 x i32> @test_div7_16i32(<16 x
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
-; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
@@ -313,7 +312,6 @@ define <16 x i32> @test_rem7_16i32(<16 x
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
-; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-128.ll?rev=340690&r1=340689&r2=340690&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-128.ll Sat Aug 25 11:01:24 2018
@@ -128,13 +128,12 @@ define <4 x i32> @test_div7_4i32(<4 x i3
;
; AVX2-LABEL: test_div7_4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -422,13 +421,12 @@ define <4 x i32> @test_rem7_4i32(<4 x i3
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2
; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-256.ll?rev=340690&r1=340689&r2=340690&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-256.ll Sat Aug 25 11:01:24 2018
@@ -96,41 +96,37 @@ define <4 x i64> @test_div7_4i64(<4 x i6
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
@@ -371,46 +367,42 @@ define <4 x i64> @test_rem7_4i64(<4 x i6
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm4
+; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7]
+; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
-; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-512.ll?rev=340690&r1=340689&r2=340690&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-udiv-512.ll Sat Aug 25 11:01:24 2018
@@ -94,7 +94,6 @@ define <16 x i32> @test_div7_16i32(<16 x
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
-; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
@@ -324,7 +323,6 @@ define <16 x i32> @test_rem7_16i32(<16 x
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
-; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
Modified: llvm/trunk/test/CodeGen/X86/vector-idiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv.ll?rev=340690&r1=340689&r2=340690&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv.ll Sat Aug 25 11:01:24 2018
@@ -24,24 +24,19 @@ define <4 x i32> @PR20355(<4 x i32> %a)
; SSE2-LABEL: PR20355:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE2-NEXT: psubd %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrld $31, %xmm0
-; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: PR20355:
@@ -71,13 +66,12 @@ define <4 x i32> @PR20355(<4 x i32> %a)
;
; AVX2-LABEL: PR20355:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vselect-avx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vselect-avx.ll?rev=340690&r1=340689&r2=340690&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vselect-avx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vselect-avx.ll Sat Aug 25 11:01:24 2018
@@ -106,13 +106,12 @@ define void @test3(<4 x i32> %induction3
;
; AVX2-LABEL: test3:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766]
+; AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpmuldq %xmm4, %xmm0, %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
; AVX2-NEXT: vpsrld $31, %xmm3, %xmm4
; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,3,3,3]