[llvm] r347502 - [DAG] consolidate shift simplifications
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 23 12:05:12 PST 2018
Author: spatel
Date: Fri Nov 23 12:05:12 2018
New Revision: 347502
URL: http://llvm.org/viewvc/llvm-project?rev=347502&view=rev
Log:
[DAG] consolidate shift simplifications
...and use them to avoid creating obviously undef values, as
discussed in the post-commit thread for r347478.
The diffs in vector div/rem show that we were missing real
optimizations by creating bogus shift nodes.
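For context, the rules consolidated below behave roughly as in this standalone
sketch. This is a hypothetical, scalar-only i32 model, not LLVM code: the real
SelectionDAG::simplifyShift() added in this patch also handles vector splats via
isNullOrNullSplat() and only folds the too-big-shift case to undef when
ISD::matchUnaryPredicate() proves every element is out of range.

#include <cstdint>
#include <iostream>
#include <optional>

// Toy scalar value: IsUndef models ISD::UNDEF, Bits an i32 constant.
struct Val {
  bool IsUndef = false;
  uint32_t Bits = 0;
};

// Returns the simplified result, or std::nullopt when no fold applies and a
// real shift node would have to be created.
std::optional<Val> simplifyShift(Val X, Val Y) {
  // shift undef, Y --> 0 (the undef operand can always be assumed to be 0)
  if (X.IsUndef)
    return Val{false, 0};
  // shift X, undef --> undef (the amount might be >= the bit width)
  if (Y.IsUndef)
    return Val{true, 0};
  // shift 0, Y --> 0 and shift X, 0 --> X: both return X unchanged.
  if (X.Bits == 0 || Y.Bits == 0)
    return X;
  // shift X, C >= bitwidth(X) --> undef (i32 here, so the bit width is 32)
  if (Y.Bits >= 32)
    return Val{true, 0};
  return std::nullopt;
}

int main() {
  // e.g. (shl 7, 40) folds straight to undef instead of building a node.
  std::optional<Val> R = simplifyShift(Val{false, 7}, Val{false, 40});
  std::cout << (R && R->IsUndef ? "undef" : "no fold or constant") << '\n';
}

In the patch itself these checks run both in DAGCombiner::visitSHL/SRA/SRL and
directly in SelectionDAG::getNode(), so the bogus shift nodes are never built
in the first place; the test diffs below show the follow-on optimizations that
this unlocks.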
Modified:
llvm/trunk/include/llvm/CodeGen/SelectionDAG.h
llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/trunk/test/CodeGen/X86/bt.ll
llvm/trunk/test/CodeGen/X86/combine-sdiv.ll
llvm/trunk/test/CodeGen/X86/combine-srem.ll
llvm/trunk/test/CodeGen/X86/scheduler-backtracking.ll
llvm/trunk/test/CodeGen/X86/shift-folding.ll
Modified: llvm/trunk/include/llvm/CodeGen/SelectionDAG.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/SelectionDAG.h?rev=347502&r1=347501&r2=347502&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/SelectionDAG.h (original)
+++ llvm/trunk/include/llvm/CodeGen/SelectionDAG.h Fri Nov 23 12:05:12 2018
@@ -965,6 +965,9 @@ public:
/// Try to simplify a select/vselect into 1 of its operands or a constant.
SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal);
+ /// Try to simplify a shift into 1 of its operands or a constant.
+ SDValue simplifyShift(SDValue X, SDValue Y);
+
/// VAArg produces a result and token chain, and takes a pointer
/// and a source value as input.
SDValue getVAArg(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
Modified: llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h?rev=347502&r1=347501&r2=347502&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h (original)
+++ llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h Fri Nov 23 12:05:12 2018
@@ -1613,6 +1613,11 @@ ConstantSDNode *isConstOrConstSplat(SDVa
/// Returns the SDNode if it is a constant splat BuildVector or constant float.
ConstantFPSDNode *isConstOrConstSplatFP(SDValue N, bool AllowUndefs = false);
+/// Determines if it is a constant null integer or a splatted vector of a
+/// constant null integer (with no undefs).
+/// Build vector implicit truncation is not an issue for null values.
+bool isNullOrNullSplat(SDValue V);
+
class GlobalAddressSDNode : public SDNode {
friend class SelectionDAG;
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=347502&r1=347501&r2=347502&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Fri Nov 23 12:05:12 2018
@@ -911,16 +911,6 @@ static bool isConstantOrConstantVector(S
return true;
}
-// Determines if it is a constant null integer or a splatted vector of a
-// constant null integer (with no undefs).
-// Build vector implicit truncation is not an issue for null values.
-static bool isNullConstantOrNullSplatConstant(SDValue N) {
- // TODO: may want to use peekThroughBitcast() here.
- if (ConstantSDNode *Splat = isConstOrConstSplat(N))
- return Splat->isNullValue();
- return false;
-}
-
// Determines if it is a constant integer of one or a splatted vector of a
// constant integer of one (with no undefs).
// Do not permit build vector implicit truncation.
@@ -1922,9 +1912,9 @@ SDValue DAGCombiner::foldBinOpIntoSelect
// or X, (select Cond, -1, 0) --> select Cond, -1, X
auto BinOpcode = BO->getOpcode();
bool CanFoldNonConst = (BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
- (isNullConstantOrNullSplatConstant(CT) ||
+ (isNullOrNullSplat(CT) ||
isAllOnesConstantOrAllOnesSplatConstant(CT)) &&
- (isNullConstantOrNullSplatConstant(CF) ||
+ (isNullOrNullSplat(CF) ||
isAllOnesConstantOrAllOnesSplatConstant(CF));
SDValue CBO = BO->getOperand(SelOpNo ^ 1);
@@ -2123,13 +2113,11 @@ SDValue DAGCombiner::visitADD(SDNode *N)
return RADD;
// fold ((0-A) + B) -> B-A
- if (N0.getOpcode() == ISD::SUB &&
- isNullConstantOrNullSplatConstant(N0.getOperand(0)))
+ if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0)))
return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
// fold (A + (0-B)) -> A-B
- if (N1.getOpcode() == ISD::SUB &&
- isNullConstantOrNullSplatConstant(N1.getOperand(0)))
+ if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1));
// fold (A+(B-A)) -> B
@@ -2244,7 +2232,7 @@ SDValue DAGCombiner::visitADDLike(SDValu
// fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
- isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0)))
+ isNullOrNullSplat(N1.getOperand(0).getOperand(0)))
return DAG.getNode(ISD::SUB, DL, VT, N0,
DAG.getNode(ISD::SHL, DL, VT,
N1.getOperand(0).getOperand(1),
@@ -2594,7 +2582,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N)
DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
}
- if (isNullConstantOrNullSplatConstant(N0)) {
+ if (isNullOrNullSplat(N0)) {
unsigned BitWidth = VT.getScalarSizeInBits();
// Right-shifting everything out but the sign bit followed by negation is
// the same as flipping arithmetic/logical shift type without the negation:
@@ -2629,8 +2617,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N)
return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
// fold (A - (0-B)) -> A+B
- if (N1.getOpcode() == ISD::SUB &&
- isNullConstantOrNullSplatConstant(N1.getOperand(0)))
+ if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1));
// fold A-(A-B) -> B
@@ -2684,14 +2671,14 @@ SDValue DAGCombiner::visitSUB(SDNode *N)
// fold (X - (-Y * Z)) -> (X + (Y * Z))
if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
if (N1.getOperand(0).getOpcode() == ISD::SUB &&
- isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) {
+ isNullOrNullSplat(N1.getOperand(0).getOperand(0))) {
SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
N1.getOperand(0).getOperand(1),
N1.getOperand(1));
return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
}
if (N1.getOperand(1).getOpcode() == ISD::SUB &&
- isNullConstantOrNullSplatConstant(N1.getOperand(1).getOperand(0))) {
+ isNullOrNullSplat(N1.getOperand(1).getOperand(0))) {
SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
N1.getOperand(0),
N1.getOperand(1).getOperand(1));
@@ -3911,7 +3898,7 @@ SDValue DAGCombiner::foldLogicOfSetCCs(b
ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
bool IsInteger = OpVT.isInteger();
if (LR == RR && CC0 == CC1 && IsInteger) {
- bool IsZero = isNullConstantOrNullSplatConstant(LR);
+ bool IsZero = isNullOrNullSplat(LR);
bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR);
// All bits clear?
@@ -4661,7 +4648,7 @@ SDValue DAGCombiner::visitAND(SDNode *N)
// Note: the SimplifyDemandedBits fold below can make an information-losing
// transform, and then we have no way to find this better fold.
if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
- if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) {
+ if (isNullOrNullSplat(N0.getOperand(0))) {
SDValue SubRHS = N0.getOperand(1);
if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
@@ -6322,7 +6309,7 @@ SDValue DAGCombiner::visitRotate(SDNode
unsigned Bitsize = VT.getScalarSizeInBits();
// fold (rot x, 0) -> x
- if (isNullConstantOrNullSplatConstant(N1))
+ if (isNullOrNullSplat(N1))
return N0;
// fold (rot x, c) -> (rot x, c % BitSize)
@@ -6367,6 +6354,9 @@ SDValue DAGCombiner::visitRotate(SDNode
SDValue DAGCombiner::visitSHL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ if (SDValue V = DAG.simplifyShift(N0, N1))
+ return V;
+
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
@@ -6401,22 +6391,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N)
ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C);
- // fold (shl 0, x) -> 0
- if (isNullConstantOrNullSplatConstant(N0))
- return N0;
- // fold (shl x, c >= size(x)) -> undef
- // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
- auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
- return Val->getAPIntValue().uge(OpSizeInBits);
- };
- if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
- return DAG.getUNDEF(VT);
- // fold (shl x, 0) -> x
- if (N1C && N1C->isNullValue())
- return N0;
- // fold (shl undef, x) -> 0
- if (N0.isUndef())
- return DAG.getConstant(0, SDLoc(N), VT);
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
@@ -6606,6 +6580,9 @@ SDValue DAGCombiner::visitSHL(SDNode *N)
SDValue DAGCombiner::visitSRA(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ if (SDValue V = DAG.simplifyShift(N0, N1))
+ return V;
+
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
@@ -6626,16 +6603,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N)
ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
- // fold (sra x, c >= size(x)) -> undef
- // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
- auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
- return Val->getAPIntValue().uge(OpSizeInBits);
- };
- if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
- return DAG.getUNDEF(VT);
- // fold (sra x, 0) -> x
- if (N1C && N1C->isNullValue())
- return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
@@ -6772,6 +6739,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N)
SDValue DAGCombiner::visitSRL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ if (SDValue V = DAG.simplifyShift(N0, N1))
+ return V;
+
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
@@ -6786,19 +6756,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N)
ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C);
- // fold (srl 0, x) -> 0
- if (isNullConstantOrNullSplatConstant(N0))
- return N0;
- // fold (srl x, c >= size(x)) -> undef
- // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
- auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
- return Val->getAPIntValue().uge(OpSizeInBits);
- };
- if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
- return DAG.getUNDEF(VT);
- // fold (srl x, 0) -> x
- if (N1C && N1C->isNullValue())
- return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
@@ -7890,7 +7847,7 @@ SDValue DAGCombiner::visitVSELECT(SDNode
// TODO: This should be extended to handle any constant.
// TODO: This could be extended to handle non-loading patterns, but that
// requires thorough testing to avoid regressions.
- if (isNullConstantOrNullSplatConstant(RHS)) {
+ if (isNullOrNullSplat(RHS)) {
EVT NarrowVT = LHS.getValueType();
EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
EVT SetCCVT = getSetCCResultType(LHS.getValueType());
@@ -8710,9 +8667,9 @@ static bool isTruncateOf(SelectionDAG &D
SDValue Op1 = N->getOperand(1);
assert(Op0.getValueType() == Op1.getValueType());
- if (isNullConstantOrNullSplatConstant(Op0))
+ if (isNullOrNullSplat(Op0))
Op = Op1;
- else if (isNullConstantOrNullSplatConstant(Op1))
+ else if (isNullOrNullSplat(Op1))
Op = Op0;
else
return false;
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp?rev=347502&r1=347501&r2=347502&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp Fri Nov 23 12:05:12 2018
@@ -4648,6 +4648,9 @@ SDValue SelectionDAG::getNode(unsigned O
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
+ if (SDValue V = simplifyShift(N1, N2))
+ return V;
+ LLVM_FALLTHROUGH;
case ISD::ROTL:
case ISD::ROTR:
assert(VT == N1.getValueType() &&
@@ -4656,7 +4659,7 @@ SDValue SelectionDAG::getNode(unsigned O
"Shifts only work on integers");
assert((!VT.isVector() || VT == N2.getValueType()) &&
"Vector shift amounts must be in the same as their first arg");
- // Verify that the shift amount VT is bit enough to hold valid shift
+ // Verify that the shift amount VT is big enough to hold valid shift
// amounts. This catches things like trying to shift an i1024 value by an
// i8, which is easy to fall into in generic code that uses
// TLI.getShiftAmount().
@@ -4968,9 +4971,6 @@ SDValue SelectionDAG::getNode(unsigned O
case ISD::SDIV:
case ISD::UREM:
case ISD::SREM:
- case ISD::SRA:
- case ISD::SRL:
- case ISD::SHL:
return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0
}
}
@@ -4993,9 +4993,6 @@ SDValue SelectionDAG::getNode(unsigned O
case ISD::SDIV:
case ISD::UREM:
case ISD::SREM:
- case ISD::SRA:
- case ISD::SRL:
- case ISD::SHL:
return getUNDEF(VT); // fold op(arg1, undef) -> undef
case ISD::MUL:
case ISD::AND:
@@ -6823,6 +6820,30 @@ SDValue SelectionDAG::simplifySelect(SDV
return SDValue();
}
+SDValue SelectionDAG::simplifyShift(SDValue X, SDValue Y) {
+ // shift undef, Y --> 0 (can always assume that the undef value is 0)
+ if (X.isUndef())
+ return getConstant(0, SDLoc(X.getNode()), X.getValueType());
+ // shift X, undef --> undef (because it may shift by the bitwidth)
+ if (Y.isUndef())
+ return getUNDEF(X.getValueType());
+
+ // shift 0, Y --> 0
+ // shift X, 0 --> X
+ if (isNullOrNullSplat(X) || isNullOrNullSplat(Y))
+ return X;
+
+ // shift X, C >= bitwidth(X) --> undef
+ // All vector elements must be too big to avoid partial undefs.
+ auto isShiftTooBig = [X](ConstantSDNode *Val) {
+ return Val->getAPIntValue().uge(X.getScalarValueSizeInBits());
+ };
+ if (ISD::matchUnaryPredicate(Y, isShiftTooBig))
+ return getUNDEF(X.getValueType());
+
+ return SDValue();
+}
+
SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Ptr, SDValue SV, unsigned Align) {
SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, dl, MVT::i32) };
@@ -8358,6 +8379,12 @@ ConstantFPSDNode *llvm::isConstOrConstSp
return nullptr;
}
+bool llvm::isNullOrNullSplat(SDValue N) {
+ // TODO: may want to use peekThroughBitcast() here.
+ ConstantSDNode *Splat = isConstOrConstSplat(N);
+ return Splat && Splat->isNullValue();
+}
+
HandleSDNode::~HandleSDNode() {
DropOperands();
}
Modified: llvm/trunk/test/CodeGen/X86/bt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bt.ll?rev=347502&r1=347501&r2=347502&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bt.ll Fri Nov 23 12:05:12 2018
@@ -1067,7 +1067,7 @@ define zeroext i1 @extend(i32 %bit, i64
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: btl %eax, %ecx
+; X86-NEXT: btl %ecx, %eax
; X86-NEXT: setb %al
; X86-NEXT: retl
;
Modified: llvm/trunk/test/CodeGen/X86/combine-sdiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-sdiv.ll?rev=347502&r1=347501&r2=347502&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-sdiv.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-sdiv.ll Fri Nov 23 12:05:12 2018
@@ -1015,8 +1015,7 @@ define <4 x i32> @combine_vec_sdiv_by_po
; SSE2-NEXT: psrld $29, %xmm3
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; SSE2-NEXT: psrld $30, %xmm1
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrad $4, %xmm2
@@ -1039,18 +1038,17 @@ define <4 x i32> @combine_vec_sdiv_by_po
; SSE41-NEXT: psrld $30, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: psrld $29, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: paddd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrad $4, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrad $4, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psrad $2, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: psrad $3, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
@@ -1060,8 +1058,6 @@ define <4 x i32> @combine_vec_sdiv_by_po
; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
@@ -1104,8 +1100,7 @@ define <8 x i32> @combine_vec_sdiv_by_po
; SSE2-NEXT: psrld $29, %xmm4
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE2-NEXT: psrld $30, %xmm0
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad $4, %xmm3
@@ -1123,8 +1118,7 @@ define <8 x i32> @combine_vec_sdiv_by_po
; SSE2-NEXT: psrld $29, %xmm4
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE2-NEXT: psrld $30, %xmm2
-; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: psrad $4, %xmm3
@@ -1147,28 +1141,25 @@ define <8 x i32> @combine_vec_sdiv_by_po
; SSE41-NEXT: psrld $30, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: psrld $29, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT: paddd %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad $4, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psrad $4, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrad $2, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: psrad $2, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: psrad $3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrld $28, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psrld $28, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: psrld $30, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: psrld $30, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: psrld $29, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; SSE41-NEXT: paddd %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrad $4, %xmm3
@@ -1189,8 +1180,6 @@ define <8 x i32> @combine_vec_sdiv_by_po
; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
@@ -1201,10 +1190,9 @@ define <8 x i32> @combine_vec_sdiv_by_po
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3
-; AVX1-NEXT: vpsrld $30, %xmm2, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
@@ -1258,8 +1246,7 @@ define <16 x i32> @combine_vec_sdiv_by_p
; SSE2-NEXT: psrld $29, %xmm6
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
; SSE2-NEXT: psrld $30, %xmm0
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrad $4, %xmm5
@@ -1277,8 +1264,7 @@ define <16 x i32> @combine_vec_sdiv_by_p
; SSE2-NEXT: psrld $29, %xmm6
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
; SSE2-NEXT: psrld $30, %xmm1
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrad $4, %xmm5
@@ -1296,8 +1282,7 @@ define <16 x i32> @combine_vec_sdiv_by_p
; SSE2-NEXT: psrld $29, %xmm6
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
; SSE2-NEXT: psrld $30, %xmm4
-; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
; SSE2-NEXT: paddd %xmm2, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psrad $4, %xmm5
@@ -1315,8 +1300,7 @@ define <16 x i32> @combine_vec_sdiv_by_p
; SSE2-NEXT: psrld $29, %xmm6
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
; SSE2-NEXT: psrld $30, %xmm5
-; SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
; SSE2-NEXT: paddd %xmm3, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm2
; SSE2-NEXT: psrad $4, %xmm2
@@ -1337,70 +1321,65 @@ define <16 x i32> @combine_vec_sdiv_by_p
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld $28, %xmm5
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: psrld $30, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm6
+; SSE41-NEXT: psrld $30, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: psrld $29, %xmm0
-; SSE41-NEXT: pxor %xmm6, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrad $4, %xmm5
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: psrad $2, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm6
+; SSE41-NEXT: psrad $2, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: psrad $3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrld $28, %xmm5
-; SSE41-NEXT: movdqa %xmm1, %xmm7
-; SSE41-NEXT: psrld $30, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrld $30, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: psrld $29, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
; SSE41-NEXT: paddd %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrad $4, %xmm5
-; SSE41-NEXT: movdqa %xmm1, %xmm7
-; SSE41-NEXT: psrad $2, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrad $2, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: psrad $3, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psrad $31, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psrld $28, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: psrld $30, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm4, %xmm6
+; SSE41-NEXT: psrld $30, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: psrld $29, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
; SSE41-NEXT: paddd %xmm2, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psrad $4, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: psrad $2, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm4, %xmm6
+; SSE41-NEXT: psrad $2, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: psrad $3, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm3, %xmm5
; SSE41-NEXT: psrad $31, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm2
; SSE41-NEXT: psrld $28, %xmm2
-; SSE41-NEXT: movdqa %xmm5, %xmm7
-; SSE41-NEXT: psrld $30, %xmm7
-; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm5, %xmm6
+; SSE41-NEXT: psrld $30, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: psrld $29, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
; SSE41-NEXT: paddd %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm2
; SSE41-NEXT: psrad $4, %xmm2
@@ -1416,68 +1395,63 @@ define <16 x i32> @combine_vec_sdiv_by_p
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsrad $31, %xmm3, %xmm2
-; AVX1-NEXT: vpsrld $28, %xmm2, %xmm4
-; AVX1-NEXT: vpsrld $30, %xmm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
+; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpsrld $29, %xmm2, %xmm5
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
+; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpsrad $3, %xmm2, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
+; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
+; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpsrad $3, %xmm3, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
-; AVX1-NEXT: vpsrld $28, %xmm4, %xmm5
-; AVX1-NEXT: vpsrld $30, %xmm4, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpsrld $29, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
-; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpsrad $4, %xmm4, %xmm5
-; AVX1-NEXT: vpsrad $2, %xmm4, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpsrad $3, %xmm4, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
-; AVX1-NEXT: vpsrld $28, %xmm4, %xmm5
-; AVX1-NEXT: vpsrld $30, %xmm4, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpsrld $29, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
+; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3
+; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpsrad $3, %xmm2, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
+; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4
+; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4
; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpsrad $3, %xmm3, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
-; AVX1-NEXT: vpsrld $28, %xmm4, %xmm5
-; AVX1-NEXT: vpsrld $30, %xmm4, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpsrld $29, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $4, %xmm2, %xmm4
-; AVX1-NEXT: vpsrad $2, %xmm2, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; AVX1-NEXT: vpsrad $3, %xmm2, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
; AVX1-NEXT: retq
;
@@ -1559,10 +1533,8 @@ define <2 x i64> @combine_vec_sdiv_by_po
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: psrlq $62, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrlq $62, %xmm2
; SSE2-NEXT: paddq %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrlq $2, %xmm1
@@ -1580,24 +1552,21 @@ define <2 x i64> @combine_vec_sdiv_by_po
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: psrlq $62, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: paddq %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlq $2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
-; SSE41-NEXT: pxor %xmm2, %xmm1
-; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: paddq %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlq $2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,2305843009213693952]
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: psubq %xmm1, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
@@ -1667,69 +1636,66 @@ define <2 x i64> @combine_vec_sdiv_by_po
define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrlq $61, %xmm3
-; SSE2-NEXT: psrlq $60, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
-; SSE2-NEXT: paddq %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrlq $3, %xmm2
-; SSE2-NEXT: psrlq $4, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
-; SSE2-NEXT: xorpd %xmm2, %xmm1
-; SSE2-NEXT: psubq %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrlq $62, %xmm3
+; SSE2-NEXT: paddq %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: psrlq $2, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
+; SSE2-NEXT: xorpd %xmm3, %xmm0
+; SSE2-NEXT: psubq %xmm3, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: psrlq $62, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: paddq %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: psrlq $2, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psrlq $61, %xmm3
+; SSE2-NEXT: psrlq $60, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
-; SSE2-NEXT: xorpd %xmm3, %xmm2
-; SSE2-NEXT: psubq %xmm3, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: paddq %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: psrlq $3, %xmm1
+; SSE2-NEXT: psrlq $4, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
+; SSE2-NEXT: xorpd %xmm1, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlq $60, %xmm3
-; SSE41-NEXT: psrlq $61, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: paddq %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlq $4, %xmm2
-; SSE41-NEXT: psrlq $3, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
-; SSE41-NEXT: pxor %xmm2, %xmm1
-; SSE41-NEXT: psubq %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: psrlq $62, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: paddq %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: psrlq $2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,2305843009213693952]
-; SSE41-NEXT: pxor %xmm3, %xmm2
-; SSE41-NEXT: psubq %xmm3, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: paddq %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlq $2, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm2, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlq $60, %xmm3
+; SSE41-NEXT: psrlq $61, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: paddq %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlq $4, %xmm1
+; SSE41-NEXT: psrlq $3, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: psubq %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
@@ -1747,9 +1713,8 @@ define <4 x i64> @combine_vec_sdiv_by_po
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488]
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
-; AVX1-NEXT: vpsrlq $62, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
@@ -1818,125 +1783,119 @@ define <4 x i64> @combine_vec_sdiv_by_po
define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT: psrlq $62, %xmm5
+; SSE2-NEXT: paddq %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm0
+; SSE2-NEXT: psrlq $2, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm5 = [9223372036854775808,2305843009213693952]
+; SSE2-NEXT: xorpd %xmm5, %xmm0
+; SSE2-NEXT: psubq %xmm5, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: psrlq $61, %xmm5
-; SSE2-NEXT: psrlq $60, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
-; SSE2-NEXT: paddq %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrlq $3, %xmm3
-; SSE2-NEXT: psrlq $4, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
-; SSE2-NEXT: xorpd %xmm5, %xmm1
-; SSE2-NEXT: psubq %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psrlq $61, %xmm6
-; SSE2-NEXT: psrlq $60, %xmm3
-; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1]
-; SSE2-NEXT: paddq %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlq $60, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1]
+; SSE2-NEXT: paddq %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlq $3, %xmm4
-; SSE2-NEXT: psrlq $4, %xmm3
-; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
-; SSE2-NEXT: xorpd %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlq $4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1152921504606846976,576460752303423488]
+; SSE2-NEXT: xorpd %xmm6, %xmm1
+; SSE2-NEXT: psubq %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: psrlq $62, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1]
-; SSE2-NEXT: paddq %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSE2-NEXT: psrlq $62, %xmm7
+; SSE2-NEXT: paddq %xmm2, %xmm7
+; SSE2-NEXT: movdqa %xmm7, %xmm4
; SSE2-NEXT: psrlq $2, %xmm4
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,2305843009213693952]
-; SSE2-NEXT: xorpd %xmm7, %xmm4
-; SSE2-NEXT: psubq %xmm7, %xmm4
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: psrlq $62, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm0[1]
-; SSE2-NEXT: paddq %xmm2, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: psrlq $2, %xmm5
-; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
-; SSE2-NEXT: xorpd %xmm7, %xmm5
-; SSE2-NEXT: psubq %xmm7, %xmm5
+; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1]
+; SSE2-NEXT: xorpd %xmm5, %xmm4
+; SSE2-NEXT: psubq %xmm5, %xmm4
+; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: psrlq $61, %xmm2
+; SSE2-NEXT: psrlq $60, %xmm5
+; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1]
+; SSE2-NEXT: paddq %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: psrlq $3, %xmm2
+; SSE2-NEXT: psrlq $4, %xmm5
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1]
-; SSE2-NEXT: movapd %xmm4, %xmm0
-; SSE2-NEXT: movapd %xmm5, %xmm2
+; SSE2-NEXT: xorpd %xmm6, %xmm5
+; SSE2-NEXT: psubq %xmm6, %xmm5
+; SSE2-NEXT: movapd %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: psrlq $60, %xmm5
-; SSE41-NEXT: psrlq $61, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: paddq %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlq $4, %xmm3
-; SSE41-NEXT: psrlq $3, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
-; SSE41-NEXT: pxor %xmm5, %xmm1
-; SSE41-NEXT: psubq %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: psrlq $62, %xmm1
+; SSE41-NEXT: paddq %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrlq $2, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,2305843009213693952]
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: psubq %xmm5, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrlq $60, %xmm6
-; SSE41-NEXT: psrlq $61, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: paddq %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psrlq $61, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: paddq %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrlq $4, %xmm4
-; SSE41-NEXT: psrlq $3, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: pxor %xmm5, %xmm3
-; SSE41-NEXT: psubq %xmm5, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrlq $3, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [1152921504606846976,576460752303423488]
+; SSE41-NEXT: pxor %xmm6, %xmm1
+; SSE41-NEXT: psubq %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psrad $31, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT: psrlq $62, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: paddq %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: psrlq $2, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,2305843009213693952]
-; SSE41-NEXT: pxor %xmm4, %xmm6
-; SSE41-NEXT: psubq %xmm4, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: psrad $31, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE41-NEXT: psrlq $62, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: paddq %xmm2, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: psrlq $2, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: psubq %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: paddq %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm7
+; SSE41-NEXT: psrlq $2, %xmm7
+; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm4[0,1,2,3],xmm7[4,5,6,7]
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: psubq %xmm5, %xmm7
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: psrlq $60, %xmm5
+; SSE41-NEXT: psrlq $61, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: paddq %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: psrlq $4, %xmm3
+; SSE41-NEXT: psrlq $3, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pxor %xmm6, %xmm4
+; SSE41-NEXT: psubq %xmm6, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
@@ -1956,7 +1915,6 @@ define <8 x i64> @combine_vec_sdiv_by_po
; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5
; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
@@ -1976,9 +1934,8 @@ define <8 x i64> @combine_vec_sdiv_by_po
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpsrlq $62, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
@@ -2081,8 +2038,7 @@ define <4 x i32> @combine_vec_sdiv_by_po
; SSE2-NEXT: psrld $29, %xmm3
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; SSE2-NEXT: psrld $30, %xmm0
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $4, %xmm2
@@ -2102,27 +2058,27 @@ define <4 x i32> @combine_vec_sdiv_by_po
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrld $28, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psrld $30, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: psrld $29, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; SSE41-NEXT: paddd %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad $4, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psrad $2, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: psrad $3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; SSE41-NEXT: psubd %xmm0, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: psrld $29, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrad $4, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psrad $2, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: psrad $3, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
@@ -2132,17 +2088,16 @@ define <4 x i32> @combine_vec_sdiv_by_po
; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $2, %xmm1, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
@@ -2450,42 +2405,38 @@ define <16 x i8> @non_splat_minus_one_di
define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
; SSE2-LABEL: non_splat_minus_one_divisor_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $31, %xmm2
-; SSE2-NEXT: xorpd %xmm1, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: psrad $1, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $31, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: psrad $1, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: non_splat_minus_one_divisor_2:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $31, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: psrad $1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: non_splat_minus_one_divisor_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/combine-srem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-srem.ll?rev=347502&r1=347501&r2=347502&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-srem.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-srem.ll Fri Nov 23 12:05:12 2018
@@ -259,20 +259,18 @@ define <4 x i32> @combine_vec_srem_by_po
; SSE-NEXT: psrld $29, %xmm3
; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; SSE-NEXT: psrld $30, %xmm2
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrad $3, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; SSE-NEXT: paddd %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: psrad $3, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psrad $1, %xmm3
-; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT: psrad $2, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1
-; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT: psrad $2, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE-NEXT: pmulld {{.*}}(%rip), %xmm2
+; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_srem_by_pow2b:
@@ -282,8 +280,6 @@ define <4 x i32> @combine_vec_srem_by_po
; AVX1-NEXT: vpsrld $29, %xmm2, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpsrld $30, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
Modified: llvm/trunk/test/CodeGen/X86/scheduler-backtracking.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/scheduler-backtracking.ll?rev=347502&r1=347501&r2=347502&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/scheduler-backtracking.ll (original)
+++ llvm/trunk/test/CodeGen/X86/scheduler-backtracking.ll Fri Nov 23 12:05:12 2018
@@ -12,58 +12,53 @@
define i256 @test1(i256 %a) nounwind {
; ILP-LABEL: test1:
; ILP: # %bb.0:
-; ILP-NEXT: pushq %rbp
+; ILP-NEXT: pushq %r14
; ILP-NEXT: pushq %rbx
; ILP-NEXT: movq %rdi, %rax
-; ILP-NEXT: leal 3(%rsi,%rsi), %ebp
-; ILP-NEXT: movl %ebp, %r11d
-; ILP-NEXT: addb $-128, %r11b
; ILP-NEXT: xorl %r8d, %r8d
-; ILP-NEXT: movl $1, %r10d
-; ILP-NEXT: xorl %edi, %edi
-; ILP-NEXT: movl %ebp, %ecx
-; ILP-NEXT: shldq %cl, %r10, %rdi
-; ILP-NEXT: xorl %r9d, %r9d
+; ILP-NEXT: leal 3(%rsi,%rsi), %r11d
+; ILP-NEXT: movl $1, %r9d
+; ILP-NEXT: xorl %r14d, %r14d
; ILP-NEXT: movl %r11d, %ecx
-; ILP-NEXT: shldq %cl, %r10, %r9
+; ILP-NEXT: shldq %cl, %r9, %r14
+; ILP-NEXT: movl $1, %edi
+; ILP-NEXT: shlq %cl, %rdi
+; ILP-NEXT: movb $-128, %r10b
+; ILP-NEXT: subb %r11b, %r10b
+; ILP-NEXT: movl %r11d, %edx
+; ILP-NEXT: addb $-128, %dl
; ILP-NEXT: xorl %esi, %esi
-; ILP-NEXT: movl %ebp, %ecx
-; ILP-NEXT: shldq %cl, %rsi, %rsi
-; ILP-NEXT: movl $1, %edx
-; ILP-NEXT: shlq %cl, %rdx
+; ILP-NEXT: movl %edx, %ecx
+; ILP-NEXT: shldq %cl, %r9, %rsi
; ILP-NEXT: movl $1, %ebx
-; ILP-NEXT: movl %r11d, %ecx
; ILP-NEXT: shlq %cl, %rbx
-; ILP-NEXT: movb $-128, %cl
-; ILP-NEXT: subb %bpl, %cl
-; ILP-NEXT: shrdq %cl, %r8, %r10
-; ILP-NEXT: testb $64, %cl
-; ILP-NEXT: cmovneq %r8, %r10
+; ILP-NEXT: movl %r10d, %ecx
+; ILP-NEXT: shrdq %cl, %r8, %r9
; ILP-NEXT: testb $64, %r11b
-; ILP-NEXT: cmovneq %rbx, %r9
+; ILP-NEXT: cmovneq %rdi, %r14
+; ILP-NEXT: cmovneq %r8, %rdi
+; ILP-NEXT: testb $64, %r10b
+; ILP-NEXT: cmovneq %r8, %r9
+; ILP-NEXT: testb $64, %dl
+; ILP-NEXT: cmovneq %rbx, %rsi
; ILP-NEXT: cmovneq %r8, %rbx
-; ILP-NEXT: testb $64, %bpl
-; ILP-NEXT: cmovneq %rdx, %rdi
-; ILP-NEXT: cmovneq %r8, %rdx
-; ILP-NEXT: cmovneq %r8, %rsi
-; ILP-NEXT: testb %bpl, %bpl
+; ILP-NEXT: testb %r11b, %r11b
+; ILP-NEXT: cmovsq %r8, %r14
; ILP-NEXT: cmovsq %r8, %rdi
-; ILP-NEXT: cmovsq %r8, %rdx
-; ILP-NEXT: movq %rdi, 8(%rax)
-; ILP-NEXT: movq %rdx, (%rax)
-; ILP-NEXT: cmovsq %r9, %rsi
+; ILP-NEXT: movq %r14, 8(%rax)
+; ILP-NEXT: movq %rdi, (%rax)
+; ILP-NEXT: cmovnsq %r8, %rsi
; ILP-NEXT: cmoveq %r8, %rsi
; ILP-NEXT: movq %rsi, 24(%rax)
-; ILP-NEXT: cmovnsq %r10, %rbx
+; ILP-NEXT: cmovnsq %r9, %rbx
; ILP-NEXT: cmoveq %r8, %rbx
; ILP-NEXT: movq %rbx, 16(%rax)
; ILP-NEXT: popq %rbx
-; ILP-NEXT: popq %rbp
+; ILP-NEXT: popq %r14
; ILP-NEXT: retq
;
; HYBRID-LABEL: test1:
; HYBRID: # %bb.0:
-; HYBRID-NEXT: pushq %rbx
; HYBRID-NEXT: movq %rdi, %rax
; HYBRID-NEXT: leal 3(%rsi,%rsi), %r10d
; HYBRID-NEXT: movb $-128, %cl
@@ -74,42 +69,37 @@ define i256 @test1(i256 %a) nounwind {
; HYBRID-NEXT: shrdq %cl, %r8, %r9
; HYBRID-NEXT: testb $64, %cl
; HYBRID-NEXT: cmovneq %r8, %r9
-; HYBRID-NEXT: xorl %edx, %edx
-; HYBRID-NEXT: movl %r10d, %ecx
-; HYBRID-NEXT: shldq %cl, %rsi, %rdx
-; HYBRID-NEXT: addb $-128, %cl
; HYBRID-NEXT: xorl %r11d, %r11d
+; HYBRID-NEXT: movl %r10d, %ecx
; HYBRID-NEXT: shldq %cl, %rsi, %r11
+; HYBRID-NEXT: addb $-128, %cl
+; HYBRID-NEXT: xorl %edx, %edx
+; HYBRID-NEXT: shldq %cl, %rsi, %rdx
; HYBRID-NEXT: movl $1, %edi
; HYBRID-NEXT: shlq %cl, %rdi
; HYBRID-NEXT: testb $64, %cl
-; HYBRID-NEXT: cmovneq %rdi, %r11
+; HYBRID-NEXT: cmovneq %rdi, %rdx
; HYBRID-NEXT: cmovneq %r8, %rdi
-; HYBRID-NEXT: xorl %ebx, %ebx
; HYBRID-NEXT: movl %r10d, %ecx
-; HYBRID-NEXT: shldq %cl, %rbx, %rbx
; HYBRID-NEXT: shlq %cl, %rsi
; HYBRID-NEXT: testb $64, %r10b
-; HYBRID-NEXT: cmovneq %rsi, %rdx
-; HYBRID-NEXT: cmovneq %r8, %rbx
+; HYBRID-NEXT: cmovneq %rsi, %r11
; HYBRID-NEXT: cmovneq %r8, %rsi
; HYBRID-NEXT: testb %r10b, %r10b
-; HYBRID-NEXT: cmovsq %r8, %rdx
-; HYBRID-NEXT: movq %rdx, 8(%rax)
+; HYBRID-NEXT: cmovsq %r8, %r11
+; HYBRID-NEXT: movq %r11, 8(%rax)
; HYBRID-NEXT: cmovsq %r8, %rsi
; HYBRID-NEXT: movq %rsi, (%rax)
-; HYBRID-NEXT: cmovsq %r11, %rbx
-; HYBRID-NEXT: cmoveq %r8, %rbx
-; HYBRID-NEXT: movq %rbx, 24(%rax)
+; HYBRID-NEXT: cmovnsq %r8, %rdx
+; HYBRID-NEXT: cmoveq %r8, %rdx
+; HYBRID-NEXT: movq %rdx, 24(%rax)
; HYBRID-NEXT: cmovnsq %r9, %rdi
; HYBRID-NEXT: cmoveq %r8, %rdi
; HYBRID-NEXT: movq %rdi, 16(%rax)
-; HYBRID-NEXT: popq %rbx
; HYBRID-NEXT: retq
;
; BURR-LABEL: test1:
; BURR: # %bb.0:
-; BURR-NEXT: pushq %rbx
; BURR-NEXT: movq %rdi, %rax
; BURR-NEXT: leal 3(%rsi,%rsi), %r10d
; BURR-NEXT: movb $-128, %cl
@@ -120,42 +110,37 @@ define i256 @test1(i256 %a) nounwind {
; BURR-NEXT: shrdq %cl, %r8, %r9
; BURR-NEXT: testb $64, %cl
; BURR-NEXT: cmovneq %r8, %r9
-; BURR-NEXT: xorl %edx, %edx
-; BURR-NEXT: movl %r10d, %ecx
-; BURR-NEXT: shldq %cl, %rsi, %rdx
-; BURR-NEXT: addb $-128, %cl
; BURR-NEXT: xorl %r11d, %r11d
+; BURR-NEXT: movl %r10d, %ecx
; BURR-NEXT: shldq %cl, %rsi, %r11
+; BURR-NEXT: addb $-128, %cl
+; BURR-NEXT: xorl %edx, %edx
+; BURR-NEXT: shldq %cl, %rsi, %rdx
; BURR-NEXT: movl $1, %edi
; BURR-NEXT: shlq %cl, %rdi
; BURR-NEXT: testb $64, %cl
-; BURR-NEXT: cmovneq %rdi, %r11
+; BURR-NEXT: cmovneq %rdi, %rdx
; BURR-NEXT: cmovneq %r8, %rdi
-; BURR-NEXT: xorl %ebx, %ebx
; BURR-NEXT: movl %r10d, %ecx
-; BURR-NEXT: shldq %cl, %rbx, %rbx
; BURR-NEXT: shlq %cl, %rsi
; BURR-NEXT: testb $64, %r10b
-; BURR-NEXT: cmovneq %rsi, %rdx
-; BURR-NEXT: cmovneq %r8, %rbx
+; BURR-NEXT: cmovneq %rsi, %r11
; BURR-NEXT: cmovneq %r8, %rsi
; BURR-NEXT: testb %r10b, %r10b
-; BURR-NEXT: cmovsq %r8, %rdx
-; BURR-NEXT: movq %rdx, 8(%rax)
+; BURR-NEXT: cmovsq %r8, %r11
+; BURR-NEXT: movq %r11, 8(%rax)
; BURR-NEXT: cmovsq %r8, %rsi
; BURR-NEXT: movq %rsi, (%rax)
-; BURR-NEXT: cmovsq %r11, %rbx
-; BURR-NEXT: cmoveq %r8, %rbx
-; BURR-NEXT: movq %rbx, 24(%rax)
+; BURR-NEXT: cmovnsq %r8, %rdx
+; BURR-NEXT: cmoveq %r8, %rdx
+; BURR-NEXT: movq %rdx, 24(%rax)
; BURR-NEXT: cmovnsq %r9, %rdi
; BURR-NEXT: cmoveq %r8, %rdi
; BURR-NEXT: movq %rdi, 16(%rax)
-; BURR-NEXT: popq %rbx
; BURR-NEXT: retq
;
; SRC-LABEL: test1:
; SRC: # %bb.0:
-; SRC-NEXT: pushq %r14
; SRC-NEXT: pushq %rbx
; SRC-NEXT: movq %rdi, %rax
; SRC-NEXT: leal 3(%rsi,%rsi), %r9d
@@ -175,14 +160,11 @@ define i256 @test1(i256 %a) nounwind {
; SRC-NEXT: xorl %edx, %edx
; SRC-NEXT: movl %r9d, %ecx
; SRC-NEXT: shldq %cl, %rdi, %rdx
-; SRC-NEXT: xorl %r14d, %r14d
-; SRC-NEXT: shldq %cl, %r14, %r14
; SRC-NEXT: movl $1, %ebx
; SRC-NEXT: shlq %cl, %rbx
; SRC-NEXT: testb $64, %r9b
; SRC-NEXT: cmovneq %rbx, %rdx
; SRC-NEXT: cmovneq %r8, %rbx
-; SRC-NEXT: cmovneq %r8, %r14
; SRC-NEXT: movl %r11d, %ecx
; SRC-NEXT: shlq %cl, %rdi
; SRC-NEXT: testb $64, %r11b
@@ -191,7 +173,7 @@ define i256 @test1(i256 %a) nounwind {
; SRC-NEXT: testb %r9b, %r9b
; SRC-NEXT: cmovnsq %r10, %rdi
; SRC-NEXT: cmoveq %r8, %rdi
-; SRC-NEXT: cmovnsq %r14, %rsi
+; SRC-NEXT: cmovnsq %r8, %rsi
; SRC-NEXT: cmoveq %r8, %rsi
; SRC-NEXT: cmovsq %r8, %rdx
; SRC-NEXT: cmovsq %r8, %rbx
@@ -200,7 +182,6 @@ define i256 @test1(i256 %a) nounwind {
; SRC-NEXT: movq %rsi, 24(%rax)
; SRC-NEXT: movq %rdi, 16(%rax)
; SRC-NEXT: popq %rbx
-; SRC-NEXT: popq %r14
; SRC-NEXT: retq
;
; LIN-LABEL: test1:
@@ -208,48 +189,44 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: movq %rdi, %rax
; LIN-NEXT: xorl %r9d, %r9d
; LIN-NEXT: movl $1, %r8d
-; LIN-NEXT: leal 3(%rsi,%rsi), %r11d
-; LIN-NEXT: movl $1, %edx
-; LIN-NEXT: movl %r11d, %ecx
-; LIN-NEXT: shlq %cl, %rdx
-; LIN-NEXT: testb $64, %r11b
-; LIN-NEXT: movq %rdx, %rcx
+; LIN-NEXT: leal 3(%rsi,%rsi), %edx
+; LIN-NEXT: movl $1, %esi
+; LIN-NEXT: movl %edx, %ecx
+; LIN-NEXT: shlq %cl, %rsi
+; LIN-NEXT: testb $64, %dl
+; LIN-NEXT: movq %rsi, %rcx
; LIN-NEXT: cmovneq %r9, %rcx
-; LIN-NEXT: testb %r11b, %r11b
+; LIN-NEXT: testb %dl, %dl
; LIN-NEXT: cmovsq %r9, %rcx
; LIN-NEXT: movq %rcx, (%rdi)
; LIN-NEXT: xorl %edi, %edi
-; LIN-NEXT: movl %r11d, %ecx
+; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shldq %cl, %r8, %rdi
-; LIN-NEXT: cmovneq %rdx, %rdi
+; LIN-NEXT: cmovneq %rsi, %rdi
; LIN-NEXT: cmovsq %r9, %rdi
; LIN-NEXT: movq %rdi, 8(%rax)
-; LIN-NEXT: movl %r11d, %edx
-; LIN-NEXT: addb $-128, %dl
+; LIN-NEXT: movl %edx, %esi
+; LIN-NEXT: addb $-128, %sil
; LIN-NEXT: movl $1, %r10d
-; LIN-NEXT: movl %edx, %ecx
+; LIN-NEXT: movl %esi, %ecx
; LIN-NEXT: shlq %cl, %r10
-; LIN-NEXT: testb $64, %dl
+; LIN-NEXT: testb $64, %sil
; LIN-NEXT: movq %r10, %rdi
; LIN-NEXT: cmovneq %r9, %rdi
; LIN-NEXT: movb $-128, %cl
-; LIN-NEXT: subb %r11b, %cl
-; LIN-NEXT: movl $1, %esi
-; LIN-NEXT: shrdq %cl, %r9, %rsi
+; LIN-NEXT: subb %dl, %cl
+; LIN-NEXT: movl $1, %edx
+; LIN-NEXT: shrdq %cl, %r9, %rdx
; LIN-NEXT: testb $64, %cl
-; LIN-NEXT: cmovneq %r9, %rsi
-; LIN-NEXT: cmovsq %rdi, %rsi
-; LIN-NEXT: cmoveq %r9, %rsi
-; LIN-NEXT: movq %rsi, 16(%rax)
-; LIN-NEXT: xorl %esi, %esi
-; LIN-NEXT: movl %edx, %ecx
-; LIN-NEXT: shldq %cl, %r8, %rsi
-; LIN-NEXT: cmovneq %r10, %rsi
-; LIN-NEXT: xorl %edx, %edx
-; LIN-NEXT: movl %r11d, %ecx
-; LIN-NEXT: shldq %cl, %rdx, %rdx
; LIN-NEXT: cmovneq %r9, %rdx
-; LIN-NEXT: cmovsq %rsi, %rdx
+; LIN-NEXT: cmovsq %rdi, %rdx
+; LIN-NEXT: cmoveq %r9, %rdx
+; LIN-NEXT: movq %rdx, 16(%rax)
+; LIN-NEXT: xorl %edx, %edx
+; LIN-NEXT: movl %esi, %ecx
+; LIN-NEXT: shldq %cl, %r8, %rdx
+; LIN-NEXT: cmovneq %r10, %rdx
+; LIN-NEXT: cmovnsq %r9, %rdx
; LIN-NEXT: cmoveq %r9, %rdx
; LIN-NEXT: movq %rdx, 24(%rax)
; LIN-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/shift-folding.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shift-folding.ll?rev=347502&r1=347501&r2=347502&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shift-folding.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shift-folding.ll Fri Nov 23 12:05:12 2018
@@ -71,3 +71,15 @@ define i64 @test5(i16 %i, i32* %arr) {
%sum = add i64 %val.zext, %index.zext
ret i64 %sum
}
+
+; We should not crash because an undef shift was created.
+
+define i32 @overshift(i32 %a) {
+; CHECK-LABEL: overshift:
+; CHECK: # %bb.0:
+; CHECK-NEXT: retl
+ %shr = lshr i32 %a, 33
+ %xor = xor i32 1, %shr
+ ret i32 %xor
+}
+
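The new @overshift test above pins down the behavior this patch relies on: a shift amount that is
greater than or equal to the bit width (lshr i32 %a, 33) has no defined result, so the DAG can fold
it to undef up front instead of building a bogus shift node, and the xor that consumes it collapses
until only the return remains. As a rough illustration only, here is a minimal standalone C++ sketch
of that shift-amount check; the helper name simulateLogicalShiftRight and its signature are made up
for this example and are not the actual SelectionDAG::simplifyShift code.

    // Standalone sketch: treat an out-of-range shift amount as undef
    // rather than computing a bogus value, mirroring the idea the
    // @overshift test exercises (not the real SelectionDAG code).
    #include <cstdint>
    #include <iostream>
    #include <optional>

    // std::nullopt models an undef result: shifting an N-bit value by
    // N or more bits has no defined result in LLVM IR.
    std::optional<uint32_t> simulateLogicalShiftRight(uint32_t X, uint32_t Amt,
                                                      unsigned BitWidth = 32) {
      if (Amt >= BitWidth)
        return std::nullopt; // shift amount too big -> undef
      if (Amt == 0)
        return X;            // shift by zero -> X unchanged
      return X >> Amt;
    }

    int main() {
      // Mirrors %shr = lshr i32 %a, 33 from @overshift: the result is
      // undef, so any user of it is free to fold away entirely.
      auto Shr = simulateLogicalShiftRight(0x12345678u, 33);
      std::cout << (Shr ? "defined" : "undef (folds away)") << '\n';
      return 0;
    }

Once the shift is recognized as undef at creation time, the CHECK lines in the test reduce to a bare
retl, which is exactly the "do not crash, do not emit dead shift code" outcome the comment asks for.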