[llvm] r288250 - [X86][SSE] Add support for target shuffle constant folding

Simon Pilgrim via llvm-commits <llvm-commits at lists.llvm.org>
Wed Nov 30 08:33:46 PST 2016


Author: rksimon
Date: Wed Nov 30 10:33:46 2016
New Revision: 288250

URL: http://llvm.org/viewvc/llvm-project?rev=288250&view=rev
Log:
[X86][SSE] Add support for target shuffle constant folding

This adds initial support for target shuffle constant folding in cases where all of the shuffle inputs are constant. In the future we may be able to relax this and merge shuffles where only some of the inputs are constant.

I've added the helper function getTargetConstantBitsFromNode (based on a similar function in X86ShuffleDecodeConstantPool.cpp), which could be reused for other cases requiring constant vector extraction.

Differential Revision: https://reviews.llvm.org/D27220
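
For illustration, a shuffle whose data and selector operands are both constants, in the spirit of the updated tests below, can now be evaluated at compile time. A minimal sketch (the function name fold_pshufb_example is invented for this example; the exact codegen depends on the subtarget):

declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

define <16 x i8> @fold_pshufb_example() {
  ; Both the source bytes and the shuffle selector are constants, so the
  ; combiner can fold the whole pshufb into a single vector constant.
  %r = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(
           <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>,
           <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %r
}

Run through llc on an SSSE3-capable target (e.g. with -mattr=+ssse3), this should now emit a single vector constant load (the reversed byte sequence) instead of materialising the input and shuffling it with pshufb, much like the updated constant_fold_pshufb test below.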

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll
    llvm/trunk/test/CodeGen/X86/vselect-avx.ll
    llvm/trunk/test/CodeGen/X86/widen_load-2.ll
    llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=288250&r1=288249&r2=288250&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Nov 30 10:33:46 2016
@@ -4622,9 +4622,9 @@ static SDValue getConstVector(ArrayRef<i
   return ConstsNode;
 }
 
-static SDValue getConstVector(ArrayRef<APInt> Values, SmallBitVector &Undefs,
+static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
-  assert(Values.size() == Undefs.size() && "Unequal constant and undef arrays");
+  assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
   SmallVector<SDValue, 32> Ops;
   bool Split = false;
 
@@ -4637,16 +4637,22 @@ static SDValue getConstVector(ArrayRef<A
   }
 
   MVT EltVT = ConstVecVT.getVectorElementType();
-  for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
     if (Undefs[i]) {
       Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
       continue;
     }
-    const APInt &V = Values[i];
+    const APInt &V = Bits[i];
     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
     if (Split) {
       Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
       Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
+    } else if (EltVT == MVT::f32) {
+      APFloat FV(APFloat::IEEEsingle, V);
+      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
+    } else if (EltVT == MVT::f64) {
+      APFloat FV(APFloat::IEEEdouble, V);
+      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
     } else {
       Ops.push_back(DAG.getConstant(V, dl, EltVT));
     }
@@ -5037,6 +5043,77 @@ static const Constant *getTargetConstant
   return dyn_cast<Constant>(CNode->getConstVal());
 }
 
+// Extract constant bits from constant pool vector.
+static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
+                                          SmallBitVector &UndefElts,
+                                          SmallVectorImpl<APInt> &EltBits) {
+  assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+  assert(EltBits.empty() && "Expected an empty EltBits vector");
+
+  EVT VT = Op.getValueType();
+  unsigned SizeInBits = VT.getSizeInBits();
+  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
+  unsigned NumElts = SizeInBits / EltSizeInBits;
+
+  auto *Cst = getTargetConstantFromNode(Op);
+  if (!Cst)
+    return false;
+
+  Type *CstTy = Cst->getType();
+  if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
+    return false;
+
+  // Extract all the undef/constant element data and pack into single bitsets.
+  APInt UndefBits(SizeInBits, 0);
+  APInt MaskBits(SizeInBits, 0);
+
+  unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+  for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
+    auto *COp = Cst->getAggregateElement(i);
+    if (!COp ||
+        !(isa<UndefValue>(COp) || isa<ConstantInt>(COp) ||
+          isa<ConstantFP>(COp)))
+      return false;
+
+    if (isa<UndefValue>(COp)) {
+      APInt EltUndef = APInt::getLowBitsSet(SizeInBits, CstEltSizeInBits);
+      UndefBits |= EltUndef.shl(i * CstEltSizeInBits);
+      continue;
+    }
+
+    APInt Bits;
+    if (auto *CInt = dyn_cast<ConstantInt>(COp))
+      Bits = CInt->getValue();
+    else if (auto *CFP = dyn_cast<ConstantFP>(COp))
+      Bits = CFP->getValueAPF().bitcastToAPInt();
+
+    Bits = Bits.zextOrTrunc(SizeInBits);
+    MaskBits |= Bits.shl(i * CstEltSizeInBits);
+  }
+
+  UndefElts = SmallBitVector(NumElts, false);
+  EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
+
+  // Now extract the undef/constant bit data into the target elts.
+  for (unsigned i = 0; i != NumElts; ++i) {
+    APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
+    UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+
+    // Only treat the element as UNDEF if all bits are UNDEF, otherwise
+    // treat it as zero.
+    if (UndefEltBits.isAllOnesValue()) {
+      UndefElts[i] = true;
+      continue;
+    }
+
+    APInt Bits = MaskBits.lshr(i * EltSizeInBits);
+    Bits = Bits.zextOrTrunc(EltSizeInBits);
+    EltBits[i] = Bits.getZExtValue();
+  }
+
+  return true;
+}
+
 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                         unsigned MaskEltSizeInBits,
                                         SmallVectorImpl<uint64_t> &RawMask) {
@@ -26308,6 +26385,93 @@ static bool combineX86ShuffleChain(Array
   return false;
 }
 
+// Attempt to constant fold all of the constant source ops.
+// Returns true if the entire shuffle is folded to a constant.
+// TODO: Extend this to merge multiple constant Ops and update the mask.
+static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
+                                        ArrayRef<int> Mask, SDValue Root,
+                                        bool HasVariableMask, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget &Subtarget) {
+  MVT VT = Root.getSimpleValueType();
+
+  unsigned SizeInBits = VT.getSizeInBits();
+  unsigned NumMaskElts = Mask.size();
+  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
+  unsigned NumOps = Ops.size();
+
+  // Extract constant bits from each source op.
+  bool OneUseConstantOp = false;
+  SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
+  SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue SrcOp = Ops[i];
+    OneUseConstantOp |= SrcOp.hasOneUse();
+    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
+                                       RawBitsOps[i]))
+      return false;
+  }
+
+  // Only fold if at least one of the constants is only used once or
+  // the combined shuffle has included a variable mask shuffle, this
+  // is to avoid constant pool bloat.
+  if (!OneUseConstantOp && !HasVariableMask)
+    return false;
+
+  // Shuffle the constant bits according to the mask.
+  SmallBitVector UndefElts(NumMaskElts, false);
+  SmallBitVector ZeroElts(NumMaskElts, false);
+  SmallBitVector ConstantElts(NumMaskElts, false);
+  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
+                                        APInt::getNullValue(MaskSizeInBits));
+  for (unsigned i = 0; i != NumMaskElts; ++i) {
+    int M = Mask[i];
+    if (M == SM_SentinelUndef) {
+      UndefElts[i] = true;
+      continue;
+    } else if (M == SM_SentinelZero) {
+      ZeroElts[i] = true;
+      continue;
+    }
+    assert(0 <= M && M < (int)(NumMaskElts * NumOps));
+
+    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
+    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
+
+    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
+    if (SrcUndefElts[SrcMaskIdx]) {
+      UndefElts[i] = true;
+      continue;
+    }
+
+    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
+    APInt &Bits = SrcEltBits[SrcMaskIdx];
+    if (!Bits) {
+      ZeroElts[i] = true;
+      continue;
+    }
+
+    ConstantElts[i] = true;
+    ConstantBitData[i] = Bits;
+  }
+  assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
+
+  // Create the constant data.
+  MVT MaskSVT;
+  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
+    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
+  else
+    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
+
+  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
+
+  SDLoc DL(Root);
+  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
+  DCI.AddToWorklist(CstOp.getNode());
+  DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
+  return true;
+}
+
 /// \brief Fully generic combining of x86 shuffle instructions.
 ///
 /// This should be the last combine run over the x86 shuffle instructions. Once
@@ -26491,6 +26655,11 @@ static bool combineX86ShufflesRecursivel
                                         HasVariableMask, DAG, DCI, Subtarget))
         return true;
 
+  // Attempt to constant fold all of the constant source ops.
+  if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
+                                  Subtarget))
+    return true;
+
   // We can only combine unary and binary shuffle mask cases.
   if (Ops.size() > 2)
     return false;

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll?rev=288250&r1=288249&r2=288250&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx.ll Wed Nov 30 10:33:46 2016
@@ -361,12 +361,12 @@ define <4 x float> @combine_vpermilvar_4
 define <2 x double> @constant_fold_vpermilvar_pd() {
 ; X32-LABEL: constant_fold_vpermilvar_pd:
 ; X32:       # BB#0:
-; X32-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_pd:
 ; X64:       # BB#0:
-; X64-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> <double 1.0, double 2.0>, <2 x i64> <i64 2, i64 0>)
   ret <2 x double> %1
@@ -375,12 +375,12 @@ define <2 x double> @constant_fold_vperm
 define <4 x double> @constant_fold_vpermilvar_pd_256() {
 ; X32-LABEL: constant_fold_vpermilvar_pd_256:
 ; X32:       # BB#0:
-; X32-NEXT:    vpermilpd {{.*#+}} ymm0 = mem[1,0,2,3]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_pd_256:
 ; X64:       # BB#0:
-; X64-NEXT:    vpermilpd {{.*#+}} ymm0 = mem[1,0,2,3]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
   ret <4 x double> %1
@@ -389,12 +389,12 @@ define <4 x double> @constant_fold_vperm
 define <4 x float> @constant_fold_vpermilvar_ps() {
 ; X32-LABEL: constant_fold_vpermilvar_ps:
 ; X32:       # BB#0:
-; X32-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,0,2,1]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_ps:
 ; X64:       # BB#0:
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,0,2,1]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x i32> <i32 3, i32 0, i32 2, i32 1>)
   ret <4 x float> %1
@@ -403,14 +403,12 @@ define <4 x float> @constant_fold_vpermi
 define <8 x float> @constant_fold_vpermilvar_ps_256() {
 ; X32-LABEL: constant_fold_vpermilvar_ps_256:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X32-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,5,5]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_ps_256:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,5,5]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
   ret <8 x float> %1

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll?rev=288250&r1=288249&r2=288250&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll Wed Nov 30 10:33:46 2016
@@ -685,14 +685,12 @@ define <32 x i8> @combine_psrlq_pshufb(<
 define <8 x i32> @constant_fold_permd() {
 ; X32-LABEL: constant_fold_permd:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X32-NEXT:    vpermd {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_permd:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X64-NEXT:    vpermd {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
 ; X64-NEXT:    retq
   %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
   ret <8 x i32> %1
@@ -701,14 +699,12 @@ define <8 x i32> @constant_fold_permd()
 define <8 x float> @constant_fold_permps() {
 ; X32-LABEL: constant_fold_permps:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X32-NEXT:    vpermps {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_permps:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X64-NEXT:    vpermps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
   ret <8 x float> %1
@@ -717,14 +713,12 @@ define <8 x float> @constant_fold_permps
 define <32 x i8> @constant_fold_pshufb_256() {
 ; X32-LABEL: constant_fold_pshufb_256:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[15],zero,zero,zero,zero,zero,ymm0[7,6,17],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[31],zero,zero,zero,zero,zero,ymm0[23,22]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_pshufb_256:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[15],zero,zero,zero,zero,zero,ymm0[7,6,17],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[31],zero,zero,zero,zero,zero,ymm0[23,22]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
 ; X64-NEXT:    retq
   %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
   ret <32 x i8> %1

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll?rev=288250&r1=288249&r2=288250&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll Wed Nov 30 10:33:46 2016
@@ -511,14 +511,12 @@ define <16 x i8> @combine_unpckl_arg1_ps
 define <16 x i8> @constant_fold_pshufb() {
 ; SSE-LABEL: constant_fold_pshufb:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[u,u],zero,zero,xmm0[15],zero,zero,zero,zero,zero,xmm0[7,6]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: constant_fold_pshufb:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[u,u],zero,zero,xmm0[15],zero,zero,zero,zero,zero,xmm0[7,6]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
 ; AVX-NEXT:    retq
   %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
   ret <16 x i8> %1

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll?rev=288250&r1=288249&r2=288250&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll Wed Nov 30 10:33:46 2016
@@ -261,14 +261,12 @@ define <4 x i32> @combine_vpperm_10zz32B
 define <2 x double> @constant_fold_vpermil2pd() {
 ; X32-LABEL: constant_fold_vpermil2pd:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00]
-; X32-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2pd:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00]
-; X64-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> <double 1.0, double 2.0>, <2 x double> <double -2.0, double -1.0>, <2 x i64> <i64 4, i64 2>, i8 2)
   ret <2 x double> %1
@@ -277,16 +275,12 @@ define <2 x double> @constant_fold_vperm
 define <4 x double> @constant_fold_vpermil2pd_256() {
 ; X32-LABEL: constant_fold_vpermil2pd_256:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovapd {{.*#+}} ymm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X32-NEXT:    vmovapd {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X32-NEXT:    vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3,2]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2pd_256:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovapd {{.*#+}} ymm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X64-NEXT:    vmovapd {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X64-NEXT:    vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3,2]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x double> <double -4.0, double -3.0, double -2.0, double -1.0>, <4 x i64> <i64 4, i64 8, i64 2, i64 0>, i8 2)
   ret <4 x double> %1
@@ -295,16 +289,12 @@ define <4 x double> @constant_fold_vperm
 define <4 x float> @constant_fold_vpermil2ps() {
 ; X32-LABEL: constant_fold_vpermil2ps:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X32-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X32-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,2],zero
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2ps:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X64-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X64-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,2],zero
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> <float -4.0, float -3.0, float -2.0, float -1.0>, <4 x i32> <i32 4, i32 0, i32 2, i32 8>, i8 2)
   ret <4 x float> %1
@@ -313,16 +303,12 @@ define <4 x float> @constant_fold_vpermi
 define <8 x float> @constant_fold_vpermil2ps_256() {
 ; X32-LABEL: constant_fold_vpermil2ps_256:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.000000e+00,-7.000000e+00,-6.000000e+00,-5.000000e+00,-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X32-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X32-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0,2],zero,ymm1[4],zero,ymm1[4,6]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2ps_256:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.000000e+00,-7.000000e+00,-6.000000e+00,-5.000000e+00,-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X64-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X64-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0,2],zero,ymm1[4],zero,ymm1[4,6]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00]
 ; X64-NEXT:    retq
   %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x float> <float -8.0, float -7.0, float -6.0, float -5.0, float -4.0, float -3.0, float -2.0, float -1.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 8, i32 0, i32 8, i32 0, i32 2>, i8 2)
   ret <8 x float> %1
@@ -331,16 +317,12 @@ define <8 x float> @constant_fold_vpermi
 define <16 x i8> @constant_fold_vpperm() {
 ; X32-LABEL: constant_fold_vpperm:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X32-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpperm:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X64-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
 ; X64-NEXT:    retq
   %1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> <i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16>)
   ret <16 x i8> %1

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll?rev=288250&r1=288249&r2=288250&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll Wed Nov 30 10:33:46 2016
@@ -42,10 +42,8 @@ define void @test1() {
 ; X32-NEXT:    xorps %xmm0, %xmm0
 ; X32-NEXT:    movlps %xmm0, (%esp)
 ; X32-NEXT:    movq (%esp), %mm0
-; X32-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
-; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm1
 ; X32-NEXT:    xorl %edi, %edi
 ; X32-NEXT:    maskmovq %mm1, %mm0
@@ -58,10 +56,8 @@ define void @test1() {
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    movlps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
-; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq {{.*}}(%rip), %rax
+; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm1
 ; X64-NEXT:    xorl %edi, %edi
 ; X64-NEXT:    maskmovq %mm1, %mm0

Modified: llvm/trunk/test/CodeGen/X86/vselect-avx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vselect-avx.ll?rev=288250&r1=288249&r2=288250&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vselect-avx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vselect-avx.ll Wed Nov 30 10:33:46 2016
@@ -18,13 +18,10 @@ target datalayout = "e-m:o-i64:64-f80:12
 define void @test(<4 x i16>* %a, <4 x i16>* %b) {
 ; AVX-LABEL: test:
 ; AVX:       ## BB#0: ## %body
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [65533,124,125,14807]
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vmovq %xmm1, (%rdi)
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,0,0,65535]
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
+; AVX-NEXT:    movq {{.*}}(%rip), %rax
+; AVX-NEXT:    movq %rax, (%rdi)
+; AVX-NEXT:    movq {{.*}}(%rip), %rax
+; AVX-NEXT:    movq %rax, (%rsi)
 ; AVX-NEXT:    retq
 body:
   %predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127>

Modified: llvm/trunk/test/CodeGen/X86/widen_load-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_load-2.ll?rev=288250&r1=288249&r2=288250&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_load-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_load-2.ll Wed Nov 30 10:33:46 2016
@@ -372,15 +372,10 @@ define void @rot(%i8vec3pack* nocapture
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
-; X86-NEXT:    movdqa {{.*#+}} xmm1 = <158,158,158,u>
-; X86-NEXT:    pshufb %xmm0, %xmm1
-; X86-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; X86-NEXT:    pextrw $0, %xmm1, (%edx)
+; X86-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-NEXT:    pextrw $0, %xmm0, (%edx)
 ; X86-NEXT:    movb $-98, 2(%edx)
-; X86-NEXT:    movdqa {{.*#+}} xmm1 = <1,1,1,u>
-; X86-NEXT:    pshufb %xmm0, %xmm1
-; X86-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X86-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
 ; X86-NEXT:    pextrw $0, %xmm0, (%ecx)
 ; X86-NEXT:    movb $1, 2(%ecx)
 ; X86-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -396,15 +391,10 @@ define void @rot(%i8vec3pack* nocapture
 ;
 ; X64-LABEL: rot:
 ; X64:       # BB#0: # %entry
-; X64-NEXT:    movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
-; X64-NEXT:    movdqa {{.*#+}} xmm1 = <158,158,158,u>
-; X64-NEXT:    pshufb %xmm0, %xmm1
-; X64-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; X64-NEXT:    pextrw $0, %xmm1, (%rsi)
+; X64-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-NEXT:    pextrw $0, %xmm0, (%rsi)
 ; X64-NEXT:    movb $-98, 2(%rsi)
-; X64-NEXT:    movdqa {{.*#+}} xmm1 = <1,1,1,u>
-; X64-NEXT:    pshufb %xmm0, %xmm1
-; X64-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X64-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
 ; X64-NEXT:    pextrw $0, %xmm0, (%rdx)
 ; X64-NEXT:    movb $1, 2(%rdx)
 ; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero

Modified: llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll?rev=288250&r1=288249&r2=288250&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll Wed Nov 30 10:33:46 2016
@@ -111,16 +111,14 @@ define void @shuf5(<8 x i8>* %p) nounwin
 ; X86-LABEL: shuf5:
 ; X86:       # BB#0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
-; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; X86-NEXT:    movq %xmm0, (%eax)
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: shuf5:
 ; X64:       # BB#0:
-; X64-NEXT:    movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
-; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; X64-NEXT:    movq %xmm0, (%rdi)
+; X64-NEXT:    movq {{.*}}(%rip), %rax
+; X64-NEXT:    movq %rax, (%rdi)
 ; X64-NEXT:    retq
   %v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   store <8 x i8> %v, <8 x i8>* %p, align 8



