[llvm] r314139 - [SelectionDAG] Teach simplifyDemandedBits to handle shifts by constant splat vectors

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 25 12:26:08 PDT 2017


Author: ctopper
Date: Mon Sep 25 12:26:08 2017
New Revision: 314139

URL: http://llvm.org/viewvc/llvm-project?rev=314139&view=rev
Log:
[SelectionDAG] Teach simplifyDemandedBits to handle shifts by constant splat vectors

This teaches simplifyDemandedBits to handle constant splat vector shifts.

This required changing some uses of getZExtValue to getLimitedValue, since we can't rely on legalization having used getShiftAmountTy for the shift amount.
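
As a rough sketch of the effect (a made-up example, not one of the tests in this patch): at the SelectionDAG level, demanded-bits reasoning can now look through a vector shift whose amount is a constant splat. For example:

  define <4 x i32> @splat_shl_demanded(<4 x i32> %x) {
    ; shl by 5 forces the low 5 bits to zero, and the 'and' only demands
    ; those low 5 bits, so the whole expression simplifies to zero.
    %s = shl <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
    %m = and <4 x i32> %s, <i32 31, i32 31, i32 31, i32 31>
    ret <4 x i32> %m
  }

The not-and-simplify.ll and sse2-vector-shifts.ll diffs below show the same kind of fold firing on the real tests.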

I believe there may have been a bug in the ((X << C1) >>u ShAmt) handling where we didn't check whether the inner shift amount was too large. I've fixed that here.
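
To make that concrete (hypothetical values, not taken from this patch's tests), with an i32 value something like

  ((X << 34) >>u 2)

has an inner shift amount of 34 >= 32. The old code never compared C1 against the bit width, so if the top 2 bits weren't demanded it would rewrite this as a single shift by |2 - 34| = 32, which is itself out of range. The new SA2->getAPIntValue().ult(BitWidth) check just leaves such nodes alone.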

I had to add new patterns to ARM because the zext/sext that the existing patterns were trying to match gets turned into an any_extend with this patch. I'm happy to split that part out, but I'm not sure how to test it without this change.
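
Roughly what changes for ARM (a sketch based on the existing vshll.ll tests, not new functionality): for input like

  define <8 x i16> @widen_shl(<8 x i8> %x) {
    %e = zext <8 x i8> %x to <8 x i16>
    %s = shl <8 x i16> %e, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    ret <8 x i16> %s
  }

the bits the zext fills in are all shifted out by the shl, so with this patch SimplifyDemandedBits turns the zext into an any_extend. The existing VSHLL patterns only match sext/zext, so without the new anyext patterns the vshll.ll tests would no longer select a single vshll.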

Differential Revision: https://reviews.llvm.org/D37665

Modified:
    llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
    llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
    llvm/trunk/test/CodeGen/ARM/vshll.ll
    llvm/trunk/test/CodeGen/X86/combine-shl.ll
    llvm/trunk/test/CodeGen/X86/not-and-simplify.ll
    llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll
    llvm/trunk/test/CodeGen/X86/vector-blend.ll
    llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
    llvm/trunk/test/CodeGen/X86/vector-rotate-256.ll
    llvm/trunk/test/CodeGen/X86/widen_cast-4.ll

Modified: llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h?rev=314139&r1=314138&r2=314139&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h (original)
+++ llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h Mon Sep 25 12:26:08 2017
@@ -1434,6 +1434,9 @@ public:
   const APInt &getAPIntValue() const { return Value->getValue(); }
   uint64_t getZExtValue() const { return Value->getZExtValue(); }
   int64_t getSExtValue() const { return Value->getSExtValue(); }
+  uint64_t getLimitedValue(uint64_t Limit = UINT64_MAX) {
+    return Value->getLimitedValue(Limit);
+  }
 
   bool isOne() const { return Value->isOne(); }
   bool isNullValue() const { return Value->isZero(); }

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=314139&r1=314138&r2=314139&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Mon Sep 25 12:26:08 2017
@@ -779,33 +779,38 @@ bool TargetLowering::SimplifyDemandedBit
     break;
   }
   case ISD::SHL:
-    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
-      unsigned ShAmt = SA->getZExtValue();
+    if (ConstantSDNode *SA = isConstOrConstSplat(Op.getOperand(1))) {
       SDValue InOp = Op.getOperand(0);
 
       // If the shift count is an invalid immediate, don't do anything.
-      if (ShAmt >= BitWidth)
+      if (SA->getAPIntValue().uge(BitWidth))
         break;
 
+      unsigned ShAmt = SA->getZExtValue();
+
       // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
       // single shift.  We can do this if the bottom bits (which are shifted
       // out) are never demanded.
-      if (InOp.getOpcode() == ISD::SRL &&
-          isa<ConstantSDNode>(InOp.getOperand(1))) {
-        if (ShAmt && (NewMask & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) {
-          unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
-          unsigned Opc = ISD::SHL;
-          int Diff = ShAmt-C1;
-          if (Diff < 0) {
-            Diff = -Diff;
-            Opc = ISD::SRL;
+      if (InOp.getOpcode() == ISD::SRL) {
+        if (ConstantSDNode *SA2 = isConstOrConstSplat(InOp.getOperand(1))) {
+          if (ShAmt && (NewMask & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) {
+            if (SA2->getAPIntValue().ult(BitWidth)) {
+              unsigned C1 = SA2->getZExtValue();
+              unsigned Opc = ISD::SHL;
+              int Diff = ShAmt-C1;
+              if (Diff < 0) {
+                Diff = -Diff;
+                Opc = ISD::SRL;
+              }
+
+              SDValue NewSA =
+                TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
+              EVT VT = Op.getValueType();
+              return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+                                                       InOp.getOperand(0),
+                                                       NewSA));
+            }
           }
-
-          SDValue NewSA =
-            TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
-          EVT VT = Op.getValueType();
-          return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
-                                                   InOp.getOperand(0), NewSA));
         }
       }
 
@@ -817,7 +822,7 @@ bool TargetLowering::SimplifyDemandedBit
       if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) {
         SDValue InnerOp = InOp.getOperand(0);
         EVT InnerVT = InnerOp.getValueType();
-        unsigned InnerBits = InnerVT.getSizeInBits();
+        unsigned InnerBits = InnerVT.getScalarSizeInBits();
         if (ShAmt < InnerBits && NewMask.getActiveBits() <= InnerBits &&
             isTypeDesirableForOp(ISD::SHL, InnerVT)) {
           EVT ShTy = getShiftAmountTy(InnerVT, DL);
@@ -836,45 +841,42 @@ bool TargetLowering::SimplifyDemandedBit
         // (shl (anyext x), c2-c1).  This requires that the bottom c1 bits
         // aren't demanded (as above) and that the shifted upper c1 bits of
         // x aren't demanded.
-        if (InOp.hasOneUse() &&
-            InnerOp.getOpcode() == ISD::SRL &&
-            InnerOp.hasOneUse() &&
-            isa<ConstantSDNode>(InnerOp.getOperand(1))) {
-          unsigned InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1))
-            ->getZExtValue();
-          if (InnerShAmt < ShAmt &&
-              InnerShAmt < InnerBits &&
-              NewMask.getActiveBits() <= (InnerBits - InnerShAmt + ShAmt) &&
-              NewMask.countTrailingZeros() >= ShAmt) {
-            SDValue NewSA =
-              TLO.DAG.getConstant(ShAmt - InnerShAmt, dl,
-                                  Op.getOperand(1).getValueType());
-            EVT VT = Op.getValueType();
-            SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
-                                             InnerOp.getOperand(0));
-            return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT,
-                                                     NewExt, NewSA));
+        if (InOp.hasOneUse() && InnerOp.getOpcode() == ISD::SRL &&
+            InnerOp.hasOneUse()) {
+          if (ConstantSDNode *SA2 = isConstOrConstSplat(InnerOp.getOperand(1))) {
+            unsigned InnerShAmt = SA2->getLimitedValue(InnerBits);
+            if (InnerShAmt < ShAmt &&
+                InnerShAmt < InnerBits &&
+                NewMask.getActiveBits() <= (InnerBits - InnerShAmt + ShAmt) &&
+                NewMask.countTrailingZeros() >= ShAmt) {
+              SDValue NewSA =
+                TLO.DAG.getConstant(ShAmt - InnerShAmt, dl,
+                                    Op.getOperand(1).getValueType());
+              EVT VT = Op.getValueType();
+              SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
+                                               InnerOp.getOperand(0));
+              return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT,
+                                                       NewExt, NewSA));
+            }
           }
         }
       }
 
-      Known.Zero <<= SA->getZExtValue();
-      Known.One  <<= SA->getZExtValue();
+      Known.Zero <<= ShAmt;
+      Known.One  <<= ShAmt;
       // low bits known zero.
-      Known.Zero.setLowBits(SA->getZExtValue());
+      Known.Zero.setLowBits(ShAmt);
     }
     break;
   case ISD::SRL:
-    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
-      EVT VT = Op.getValueType();
-      unsigned ShAmt = SA->getZExtValue();
-      unsigned VTSize = VT.getSizeInBits();
+    if (ConstantSDNode *SA = isConstOrConstSplat(Op.getOperand(1))) {
       SDValue InOp = Op.getOperand(0);
 
       // If the shift count is an invalid immediate, don't do anything.
-      if (ShAmt >= BitWidth)
+      if (SA->getAPIntValue().uge(BitWidth))
         break;
 
+      unsigned ShAmt = SA->getZExtValue();
       APInt InDemandedMask = (NewMask << ShAmt);
 
       // If the shift is exact, then it does demand the low bits (and knows that
@@ -885,21 +887,27 @@ bool TargetLowering::SimplifyDemandedBit
       // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
       // single shift.  We can do this if the top bits (which are shifted out)
       // are never demanded.
-      if (InOp.getOpcode() == ISD::SHL &&
-          isa<ConstantSDNode>(InOp.getOperand(1))) {
-        if (ShAmt && (NewMask & APInt::getHighBitsSet(VTSize, ShAmt)) == 0) {
-          unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
-          unsigned Opc = ISD::SRL;
-          int Diff = ShAmt-C1;
-          if (Diff < 0) {
-            Diff = -Diff;
-            Opc = ISD::SHL;
+      if (InOp.getOpcode() == ISD::SHL) {
+        if (ConstantSDNode *SA2 = isConstOrConstSplat(InOp.getOperand(1))) {
+          if (ShAmt &&
+              (NewMask & APInt::getHighBitsSet(BitWidth, ShAmt)) == 0) {
+            if (SA2->getAPIntValue().ult(BitWidth)) {
+              unsigned C1 = SA2->getZExtValue();
+              unsigned Opc = ISD::SRL;
+              int Diff = ShAmt-C1;
+              if (Diff < 0) {
+                Diff = -Diff;
+                Opc = ISD::SHL;
+              }
+
+              SDValue NewSA =
+                TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
+              EVT VT = Op.getValueType();
+              return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+                                                       InOp.getOperand(0),
+                                                       NewSA));
+            }
           }
-
-          SDValue NewSA =
-            TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
-          return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
-                                                   InOp.getOperand(0), NewSA));
         }
       }
 
@@ -923,14 +931,14 @@ bool TargetLowering::SimplifyDemandedBit
                            TLO.DAG.getNode(ISD::SRL, dl, Op.getValueType(),
                                            Op.getOperand(0), Op.getOperand(1)));
 
-    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+    if (ConstantSDNode *SA = isConstOrConstSplat(Op.getOperand(1))) {
       EVT VT = Op.getValueType();
-      unsigned ShAmt = SA->getZExtValue();
 
       // If the shift count is an invalid immediate, don't do anything.
-      if (ShAmt >= BitWidth)
+      if (SA->getAPIntValue().uge(BitWidth))
         break;
 
+      unsigned ShAmt = SA->getZExtValue();
       APInt InDemandedMask = (NewMask << ShAmt);
 
       // If the shift is exact, then it does demand the low bits (and knows that

Modified: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrNEON.td?rev=314139&r1=314138&r2=314139&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td Mon Sep 25 12:26:08 2017
@@ -5529,6 +5529,12 @@ def : Pat<(v4i32 (NEONvshl (sext (v4i16
           (VSHLLi16 DPR:$Rn, 16)>;
 def : Pat<(v2i64 (NEONvshl (sext (v2i32 DPR:$Rn)), (i32 32))),
           (VSHLLi32 DPR:$Rn, 32)>;
+def : Pat<(v8i16 (NEONvshl (anyext (v8i8 DPR:$Rn)), (i32 8))),
+          (VSHLLi8 DPR:$Rn, 8)>;
+def : Pat<(v4i32 (NEONvshl (anyext (v4i16 DPR:$Rn)), (i32 16))),
+          (VSHLLi16 DPR:$Rn, 16)>;
+def : Pat<(v2i64 (NEONvshl (anyext (v2i32 DPR:$Rn)), (i32 32))),
+          (VSHLLi32 DPR:$Rn, 32)>;
 
 //   VSHRN    : Vector Shift Right and Narrow
 defm VSHRN    : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i",

Modified: llvm/trunk/test/CodeGen/ARM/vshll.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vshll.ll?rev=314139&r1=314138&r2=314139&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/vshll.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/vshll.ll Mon Sep 25 12:26:08 2017
@@ -97,7 +97,7 @@ define <8 x i16> @vshllu8_bad(<8 x i8>*
 
 define <4 x i32> @vshlls16_bad(<4 x i16>* %A) nounwind {
 ; CHECK-LABEL: vshlls16_bad:
-; CHECK: vmovl.s16
+; CHECK: vmovl.u16
 ; CHECK: vshl.i32
         %tmp1 = load <4 x i16>, <4 x i16>* %A
         %sext = sext <4 x i16> %tmp1 to <4 x i32>

Modified: llvm/trunk/test/CodeGen/X86/combine-shl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-shl.ll?rev=314139&r1=314138&r2=314139&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-shl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-shl.ll Mon Sep 25 12:26:08 2017
@@ -193,17 +193,16 @@ define <4 x i32> @combine_vec_shl_shl_ze
 define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
 ; SSE-LABEL: combine_vec_shl_ext_shl0:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pmovsxwd %xmm0, %xmm2
-; SSE-NEXT:    pslld $20, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT:    pmovsxwd %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE-NEXT:    pslld $20, %xmm1
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    pslld $20, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_shl_ext_shl0:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT:    vpslld $20, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %1 = shl <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>

Modified: llvm/trunk/test/CodeGen/X86/not-and-simplify.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/not-and-simplify.ll?rev=314139&r1=314138&r2=314139&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/not-and-simplify.ll (original)
+++ llvm/trunk/test/CodeGen/X86/not-and-simplify.ll Mon Sep 25 12:26:08 2017
@@ -47,9 +47,7 @@ define i8 @shrink_xor_constant2(i8 %x) {
 define <16 x i8> @shrink_xor_constant2_splat(<16 x i8> %x) {
 ; ALL-LABEL: shrink_xor_constant2_splat:
 ; ALL:       # BB#0:
-; ALL-NEXT:    psllw $5, %xmm0
-; ALL-NEXT:    pand {{.*}}(%rip), %xmm0
-; ALL-NEXT:    pandn {{.*}}(%rip), %xmm0
+; ALL-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; ALL-NEXT:    retq
   %sh = shl <16 x i8> %x, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
   %not = xor <16 x i8> %sh, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>

Modified: llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll?rev=314139&r1=314138&r2=314139&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse2-vector-shifts.ll Mon Sep 25 12:26:08 2017
@@ -336,7 +336,6 @@ define <4 x i16> @sra_trunc_srl_v4i32(<4
 define <4 x i32> @shl_zext_shl_v4i32(<4 x i16> %x) nounwind {
 ; CHECK-LABEL: shl_zext_shl_v4i32:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    pslld $19, %xmm0
 ; CHECK-NEXT:    retq
   %shl0 = shl <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>

Modified: llvm/trunk/test/CodeGen/X86/vector-blend.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-blend.ll?rev=314139&r1=314138&r2=314139&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-blend.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-blend.ll Mon Sep 25 12:26:08 2017
@@ -985,17 +985,15 @@ define <4 x i32> @blend_neg_logic_v4i32_
 ; SSE41-LABEL: blend_neg_logic_v4i32_2:
 ; SSE41:       # BB#0: # %entry
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    psubd %xmm2, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    movaps %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: blend_neg_logic_v4i32_2:
 ; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vpsrad $31, %xmm1, %xmm1
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm2
 ; AVX-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0

Modified: llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll?rev=314139&r1=314138&r2=314139&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll Mon Sep 25 12:26:08 2017
@@ -1559,13 +1559,8 @@ define <2 x i64> @splatconstant_rotate_m
 ;
 ; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X32-SSE-NEXT:    psllq $15, %xmm1
 ; X32-SSE-NEXT:    psrlq $49, %xmm0
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT:    por %xmm0, %xmm1
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %shl = shl <2 x i64> %a, <i64 15, i64 15>
   %lshr = lshr <2 x i64> %a, <i64 49, i64 49>
@@ -1581,7 +1576,6 @@ define <4 x i32> @splatconstant_rotate_m
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    pslld $4, %xmm1
 ; SSE-NEXT:    psrld $28, %xmm0
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE-NEXT:    por %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -1591,7 +1585,6 @@ define <4 x i32> @splatconstant_rotate_m
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpslld $4, %xmm0, %xmm1
 ; AVX-NEXT:    vpsrld $28, %xmm0, %xmm0
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
@@ -1621,7 +1614,6 @@ define <4 x i32> @splatconstant_rotate_m
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT:    pslld $4, %xmm1
 ; X32-SSE-NEXT:    psrld $28, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X32-SSE-NEXT:    por %xmm0, %xmm1
 ; X32-SSE-NEXT:    movdqa %xmm1, %xmm0

Modified: llvm/trunk/test/CodeGen/X86/vector-rotate-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-rotate-256.ll?rev=314139&r1=314138&r2=314139&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-rotate-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-rotate-256.ll Mon Sep 25 12:26:08 2017
@@ -997,10 +997,10 @@ define <32 x i8> @splatconstant_rotate_v
 define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
 ; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlq $49, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/widen_cast-4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_cast-4.ll?rev=314139&r1=314138&r2=314139&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_cast-4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_cast-4.ll Mon Sep 25 12:26:08 2017
@@ -26,7 +26,7 @@ define void @update(i64* %dst_i, i64* %s
 ; NARROW-NEXT:    psubw %xmm0, %xmm2
 ; NARROW-NEXT:    psllw $8, %xmm2
 ; NARROW-NEXT:    psraw $8, %xmm2
-; NARROW-NEXT:    psraw $2, %xmm2
+; NARROW-NEXT:    psrlw $2, %xmm2
 ; NARROW-NEXT:    pshufb %xmm1, %xmm2
 ; NARROW-NEXT:    movq %xmm2, (%edx,%eax,8)
 ; NARROW-NEXT:    incl (%esp)



