[llvm] r313348 - [SLPVectorizer] Failure to beneficially vectorize 'copyable' elements in integer binary ops.

Ilya Biryukov via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 15 03:21:04 PDT 2017


This commit broke the buildbots. For one example, see
http://lab.llvm.org:8011/builders/clang-with-thin-lto-ubuntu/builds/5290/.

Reverted by r313352.

On Fri, Sep 15, 2017 at 8:56 AM, Dinar Temirbulatov via llvm-commits <
llvm-commits at lists.llvm.org> wrote:

> Author: dinar
> Date: Thu Sep 14 23:56:39 2017
> New Revision: 313348
>
> URL: http://llvm.org/viewvc/llvm-project?rev=313348&view=rev
> Log:
> [SLPVectorizer] Failure to beneficially vectorize 'copyable' elements in
> integer binary ops.
>
> This patch tries to improve vectorization of the following code:
>
> void add1(int * __restrict dst, const int * __restrict src) {
>   *dst++ = *src++;
>   *dst++ = *src++ + 1;
>   *dst++ = *src++ + 2;
>   *dst++ = *src++ + 3;
> }
> This allows vectorization even when the very first operation is not a binary
> add but just a load.
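>
> With this change the whole add1 sequence is expected to collapse into a
> single vector load, add and store. A sketch of the vectorized IR,
> reconstructed from the updated CHECK lines in the test below:
>
>   define void @add1(i32* noalias %dst, i32* noalias %src) {
>   entry:
>     ; load all four source elements as one <4 x i32> vector
>     %0 = bitcast i32* %src to <4 x i32>*
>     %1 = load <4 x i32>, <4 x i32>* %0, align 4
>     ; the 'copyable' lane (the plain copy) uses the add identity constant 0
>     %2 = add nsw <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %1
>     ; write the result back as one vector
>     %3 = bitcast i32* %dst to <4 x i32>*
>     store <4 x i32> %2, <4 x i32>* %3, align 4
>     ret void
>   }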
>
> Reviewers: spatel, mzolotukhin, mkuper, hfinkel, RKSimon, filcab, ABataev,
> davide
>
> Subscribers: llvm-commits, RKSimon
>
> Differential Revision: https://reviews.llvm.org/D28907
>
> Modified:
>     llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>     llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=313348&r1=313347&r2=313348&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Thu Sep 14 23:56:39 2017
> @@ -332,7 +332,7 @@ static unsigned getAltOpcode(unsigned Op
>    case Instruction::Sub:
>      return Instruction::Add;
>    default:
> -    return 0;
> +    return Op;
>    }
>  }
>
> @@ -345,6 +345,20 @@ static bool sameOpcodeOrAlt(unsigned Opc
>    return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
>  }
>
> +/// Checks if the \p Opcode can be considered as an operand of a
> (possibly)
> +/// binary operation \p I.
> +/// \returns The code of the binary operation of instruction \p I if the
> +/// instruction with \p Opcode can be considered as an operand of \p I
> with the
> +/// default value.
> +static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I)
> {
> +  assert(!sameOpcodeOrAlt(Opcode, getAltOpcode(Opcode), I->getOpcode())
> +           && "Invalid Opcode");
> +  if (Opcode != Instruction::PHI && isa<BinaryOperator>(I) &&
> +      (I->getType()->isIntegerTy() || I->hasUnsafeAlgebra()))
> +    return I->getOpcode();
> +  return 0;
> +}
> +
>  /// Chooses the correct key for scheduling data. If \p Op has the same (or
>  /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key
> is \p
>  /// OpValue.
> @@ -365,6 +379,12 @@ namespace {
>  struct RawInstructionsData {
>    /// Main Opcode of the instructions going to be vectorized.
>    unsigned Opcode = 0;
> +  /// Position of the first instruction with the \a Opcode.
> +  unsigned OpcodePos = 0;
> +  /// Need an additional analysis (if at least one of the instruction is
> not
> +  /// same instruction kind as an instruction at OpcodePos position in the
> +  /// list).
> +  bool NeedAnalysis = false;
>    /// The list of instructions have some instructions with alternate
> opcodes.
>    bool HasAltOpcodes = false;
>  };
> @@ -378,16 +398,38 @@ static RawInstructionsData getMainOpcode
>      return {};
>    RawInstructionsData Res;
>    unsigned Opcode = I0->getOpcode();
> +  unsigned AltOpcode = getAltOpcode(Opcode);
> +  unsigned NewOpcodePos = 0;
>    // Walk through the list of the vectorized instructions
>    // in order to check its structure described by RawInstructionsData.
>    for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
>      auto *I = dyn_cast<Instruction>(VL[Cnt]);
>      if (!I)
>        return {};
> -    if (Opcode != I->getOpcode())
> -      Res.HasAltOpcodes = true;
> +    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
> +      if (Opcode != I->getOpcode()) {
> +        Res.HasAltOpcodes = true;
> +        if (Res.NeedAnalysis && isOdd(NewOpcodePos))
> +          std::swap(Opcode, AltOpcode);
> +      }
> +      continue;
> +    }
> +    if (unsigned NewOpcode = tryToRepresentAsInstArg(Opcode, I)) {
> +      if (!Instruction::isBinaryOp(Opcode) ||
> +          !Instruction::isCommutative(Opcode)) {
> +        NewOpcodePos = Cnt;
> +        Opcode = NewOpcode;
> +        AltOpcode = getAltOpcode(Opcode);
> +        Res.NeedAnalysis = true;
> +      }
> +    } else if (tryToRepresentAsInstArg(I->getOpcode(),
> +                                       cast<Instruction>(VL[
> NewOpcodePos])))
> +      Res.NeedAnalysis = true;
> +    else
> +      return {};
>    }
>    Res.Opcode = Opcode;
> +  Res.OpcodePos = NewOpcodePos;
>    return Res;
>  }
>
> @@ -412,16 +454,20 @@ struct InstructionsState {
>  static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
>    auto Res = getMainOpcode(VL);
>    unsigned Opcode = Res.Opcode;
> -  if (!Res.HasAltOpcodes)
> -    return InstructionsState(VL[0], Opcode, false);
> -  auto *OpInst = cast<Instruction>(VL[0]);
> +  if (!Res.NeedAnalysis && !Res.HasAltOpcodes)
> +    return InstructionsState(VL[Res.OpcodePos], Opcode, false);
> +  auto *OpInst = cast<Instruction>(VL[Res.OpcodePos]);
>    unsigned AltOpcode = getAltOpcode(Opcode);
>    // Examine each element in the list instructions VL to determine
>    // if some operations there could be considered as an alternative
> -  // (for example as subtraction relates to addition operation).
> +  // (for example as subtraction relates to addition operation) or
> +  // operation could be an operand of a (possibly) binary operation.
>    for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
>      auto *I = cast<Instruction>(VL[Cnt]);
>      unsigned InstOpcode = I->getOpcode();
> +    if (Res.NeedAnalysis && !sameOpcodeOrAlt(Opcode, AltOpcode,
> InstOpcode))
> +      if (tryToRepresentAsInstArg(InstOpcode, OpInst))
> +        InstOpcode = (Res.HasAltOpcodes && isOdd(Cnt)) ? AltOpcode :
> Opcode;
>      if ((Res.HasAltOpcodes &&
>           InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
>          (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
> @@ -574,6 +620,7 @@ public:
>    void deleteTree() {
>      VectorizableTree.clear();
>      ScalarToTreeEntry.clear();
> +    ExtraScalarToTreeEntry.clear();
>      MustGather.clear();
>      ExternalUses.clear();
>      NumLoadsWantToKeepOrder = 0;
> @@ -713,22 +760,40 @@ private:
>      /// The TreeEntry index containing the user of this entry.  We can
> actually
>      /// have multiple users so the data structure is not truly a tree.
>      SmallVector<int, 1> UserTreeIndices;
> +
> +    /// Info about instruction in this tree entry.
> +    InstructionsState State;
>    };
>
>    /// Create a new VectorizableTree entry.
>    TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
> -                          int &UserTreeIdx) {
> +                          int &UserTreeIdx, const InstructionsState &S) {
> +    assert((!Vectorized || S.Opcode != 0) &&
> +           "Vectorized TreeEntry without opcode");
>      VectorizableTree.emplace_back(VectorizableTree);
>      int idx = VectorizableTree.size() - 1;
>      TreeEntry *Last = &VectorizableTree[idx];
>      Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
>      Last->NeedToGather = !Vectorized;
>      if (Vectorized) {
> +      Last->State = S;
> +      unsigned AltOpcode = getAltOpcode(S.Opcode);
>        for (int i = 0, e = VL.size(); i != e; ++i) {
> -        assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
> -        ScalarToTreeEntry[VL[i]] = idx;
> +        unsigned RealOpcode =
> +            (S.IsAltShuffle && isOdd(i)) ? AltOpcode : S.Opcode;
> +        Value *Key = (cast<Instruction>(VL[i])->getOpcode() ==
> RealOpcode)
> +                         ? VL[i]
> +                         : S.OpValue;
> +        assert(!getTreeEntry(VL[i], Key) && "Scalar already in tree!");
> +        if (VL[i] == Key)
> +          ScalarToTreeEntry[Key] = idx;
> +        else
> +          ExtraScalarToTreeEntry[VL[i]][Key] = idx;
>        }
>      } else {
> +      Last->State.Opcode = 0;
> +      Last->State.OpValue = VL[0];
> +      Last->State.IsAltShuffle = false;
>        MustGather.insert(VL.begin(), VL.end());
>      }
>
> @@ -756,9 +821,25 @@ private:
>      return nullptr;
>    }
>
> +  TreeEntry *getTreeEntry(Value *V, Value *OpValue) {
> +    if (V == OpValue)
> +      return getTreeEntry(V);
> +    auto I = ExtraScalarToTreeEntry.find(V);
> +    if (I != ExtraScalarToTreeEntry.end()) {
> +      auto &STT = I->second;
> +      auto STTI = STT.find(OpValue);
> +      if (STTI != STT.end())
> +        return &VectorizableTree[STTI->second];
> +    }
> +    return nullptr;
> +  }
> +
>    /// Maps a specific scalar to its tree entry.
>    SmallDenseMap<Value*, int> ScalarToTreeEntry;
>
> +  /// Maps a specific scalar to its tree entry(s) with leading scalar.
> +  SmallDenseMap<Value*, SmallDenseMap<Value*, int>>
> ExtraScalarToTreeEntry;
> +
>    /// A list of scalars that we found that we need to keep as scalars.
>    ValueSet MustGather;
>
> @@ -1327,9 +1408,15 @@ void BoUpSLP::buildTree(ArrayRef<Value *
>        continue;
>
>      // For each lane:
> +    const unsigned Opcode = Entry->State.Opcode;
> +    const unsigned AltOpcode = getAltOpcode(Opcode);
>      for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
>        Value *Scalar = Entry->Scalars[Lane];
>
> +      if (!sameOpcodeOrAlt(Opcode, AltOpcode,
> +                           cast<Instruction>(Scalar)->getOpcode()))
> +        continue;
> +
>        // Check if the scalar is externally used as an extra arg.
>        auto ExtI = ExternallyUsedValues.find(Scalar);
>        if (ExtI != ExternallyUsedValues.end()) {
> @@ -1372,6 +1459,38 @@ void BoUpSLP::buildTree(ArrayRef<Value *
>    }
>  }
>
> +static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) {
> +  switch(Opcode) {
> +  case Instruction::Add:
> +  case Instruction::Sub:
> +  case Instruction::Or:
> +  case Instruction::Xor:
> +    return ConstantInt::getNullValue(Ty);
> +  case Instruction::Mul:
> +  case Instruction::UDiv:
> +  case Instruction::SDiv:
> +  case Instruction::URem:
> +  case Instruction::SRem:
> +    return ConstantInt::get(Ty, /*V=*/1);
> +  case Instruction::FAdd:
> +  case Instruction::FSub:
> +    return ConstantFP::get(Ty, /*V=*/0.0);
> +  case Instruction::FMul:
> +  case Instruction::FDiv:
> +  case Instruction::FRem:
> +    return ConstantFP::get(Ty, /*V=*/1.0);
> +  case Instruction::And:
> +    return ConstantInt::getAllOnesValue(Ty);
> +  case Instruction::Shl:
> +  case Instruction::LShr:
> +  case Instruction::AShr:
> +    return ConstantInt::getNullValue(Type::getInt32Ty(Ty->getContext()));
> +  default:
> +    break;
> +  }
> +  llvm_unreachable("unknown binop for default constant value");
> +}
> +
>  void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
>                              int UserTreeIdx) {
>    assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
> @@ -1379,28 +1498,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>    InstructionsState S = getSameOpcode(VL);
>    if (Depth == RecursionMaxDepth) {
>      DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
> -    newTreeEntry(VL, false, UserTreeIdx);
> +    newTreeEntry(VL, false, UserTreeIdx, S);
>      return;
>    }
>
>    // Don't handle vectors.
>    if (S.OpValue->getType()->isVectorTy()) {
>      DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
> -    newTreeEntry(VL, false, UserTreeIdx);
> +    newTreeEntry(VL, false, UserTreeIdx, S);
>      return;
>    }
>
>    if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
>      if (SI->getValueOperand()->getType()->isVectorTy()) {
>        DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
> -      newTreeEntry(VL, false, UserTreeIdx);
> +      newTreeEntry(VL, false, UserTreeIdx, S);
>        return;
>      }
>
>    // If all of the operands are identical or constant we have a simple
> solution.
>    if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
>      DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
> -    newTreeEntry(VL, false, UserTreeIdx);
> +    newTreeEntry(VL, false, UserTreeIdx, S);
>      return;
>    }
>
> @@ -1412,7 +1531,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>      if (EphValues.count(VL[i])) {
>        DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
>              ") is ephemeral.\n");
> -      newTreeEntry(VL, false, UserTreeIdx);
> +      newTreeEntry(VL, false, UserTreeIdx, S);
>        return;
>      }
>    }
> @@ -1423,7 +1542,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>        DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
>        if (E->Scalars[i] != VL[i]) {
>          DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
> -        newTreeEntry(VL, false, UserTreeIdx);
> +        newTreeEntry(VL, false, UserTreeIdx, S);
>          return;
>        }
>      }
> @@ -1435,14 +1554,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>    }
>
>    // Check that none of the instructions in the bundle are already in the
> tree.
> +  unsigned AltOpcode = getAltOpcode(S.Opcode);
>    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
> +      unsigned RealOpcode = (S.IsAltShuffle && isOdd(i)) ? AltOpcode :
> S.Opcode;
>        auto *I = dyn_cast<Instruction>(VL[i]);
>        if (!I)
>          continue;
> -      if (getTreeEntry(I)) {
> +      Value *Key = (I->getOpcode() == RealOpcode) ? I : S.OpValue;
> +      if (getTreeEntry(I, Key)) {
>        DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
>              ") is already in tree.\n");
> -      newTreeEntry(VL, false, UserTreeIdx);
> +      newTreeEntry(VL, false, UserTreeIdx, S);
>        return;
>      }
>    }
> @@ -1452,7 +1574,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
>      if (MustGather.count(VL[i])) {
>        DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
> -      newTreeEntry(VL, false, UserTreeIdx);
> +      newTreeEntry(VL, false, UserTreeIdx, S);
>        return;
>      }
>    }
> @@ -1466,7 +1588,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>      // Don't go into unreachable blocks. They may contain instructions
> with
>      // dependency cycles which confuse the final scheduling.
>      DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
> -    newTreeEntry(VL, false, UserTreeIdx);
> +    newTreeEntry(VL, false, UserTreeIdx, S);
>      return;
>    }
>
> @@ -1475,7 +1597,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>      for (unsigned j = i+1; j < e; ++j)
>        if (VL[i] == VL[j]) {
>          DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
> -        newTreeEntry(VL, false, UserTreeIdx);
> +        newTreeEntry(VL, false, UserTreeIdx, S);
>          return;
>        }
>
> @@ -1490,7 +1612,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>      assert((!BS.getScheduleData(VL0) ||
>              !BS.getScheduleData(VL0)->isPartOfBundle()) &&
>             "tryScheduleBundle should cancelScheduling on failure");
> -    newTreeEntry(VL, false, UserTreeIdx);
> +    newTreeEntry(VL, false, UserTreeIdx, S);
>      return;
>    }
>    DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
> @@ -1509,12 +1631,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>            if (Term) {
>              DEBUG(dbgs() << "SLP: Need to swizzle PHINodes
> (TerminatorInst use).\n");
>              BS.cancelScheduling(VL, VL0);
> -            newTreeEntry(VL, false, UserTreeIdx);
> +            newTreeEntry(VL, false, UserTreeIdx, S);
>              return;
>            }
>          }
>
> -      newTreeEntry(VL, true, UserTreeIdx);
> +      newTreeEntry(VL, true, UserTreeIdx, S);
>        DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
>
>        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
> @@ -1536,7 +1658,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>        } else {
>          BS.cancelScheduling(VL, VL0);
>        }
> -      newTreeEntry(VL, Reuse, UserTreeIdx);
> +      newTreeEntry(VL, Reuse, UserTreeIdx, S);
>        return;
>      }
>      case Instruction::Load: {
> @@ -1552,7 +1674,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>        if (DL->getTypeSizeInBits(ScalarTy) !=
>            DL->getTypeAllocSizeInBits(ScalarTy)) {
>          BS.cancelScheduling(VL, VL0);
> -        newTreeEntry(VL, false, UserTreeIdx);
> +        newTreeEntry(VL, false, UserTreeIdx, S);
>          DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
>          return;
>        }
> @@ -1563,7 +1685,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>          LoadInst *L = cast<LoadInst>(VL[i]);
>          if (!L->isSimple()) {
>            BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx);
> +          newTreeEntry(VL, false, UserTreeIdx, S);
>            DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
>            return;
>          }
> @@ -1585,7 +1707,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>
>        if (Consecutive) {
>          ++NumLoadsWantToKeepOrder;
> -        newTreeEntry(VL, true, UserTreeIdx);
> +        newTreeEntry(VL, true, UserTreeIdx, S);
>          DEBUG(dbgs() << "SLP: added a vector of loads.\n");
>          return;
>        }
> @@ -1600,7 +1722,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>            }
>
>        BS.cancelScheduling(VL, VL0);
> -      newTreeEntry(VL, false, UserTreeIdx);
> +      newTreeEntry(VL, false, UserTreeIdx, S);
>
>        if (ReverseConsecutive) {
>          ++NumLoadsWantToChangeOrder;
> @@ -1627,12 +1749,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>          Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
>          if (Ty != SrcTy || !isValidElementType(Ty)) {
>            BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx);
> +          newTreeEntry(VL, false, UserTreeIdx, S);
>            DEBUG(dbgs() << "SLP: Gathering casts with different src
> types.\n");
>            return;
>          }
>        }
> -      newTreeEntry(VL, true, UserTreeIdx);
> +      newTreeEntry(VL, true, UserTreeIdx, S);
>        DEBUG(dbgs() << "SLP: added a vector of casts.\n");
>
>        for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
> @@ -1655,13 +1777,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>          if (Cmp->getPredicate() != P0 ||
>              Cmp->getOperand(0)->getType() != ComparedTy) {
>            BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx);
> +          newTreeEntry(VL, false, UserTreeIdx, S);
>            DEBUG(dbgs() << "SLP: Gathering cmp with different
> predicate.\n");
>            return;
>          }
>        }
>
> -      newTreeEntry(VL, true, UserTreeIdx);
> +      newTreeEntry(VL, true, UserTreeIdx, S);
>        DEBUG(dbgs() << "SLP: added a vector of compares.\n");
>
>        for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
> @@ -1693,7 +1815,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>      case Instruction::And:
>      case Instruction::Or:
>      case Instruction::Xor:
> -      newTreeEntry(VL, true, UserTreeIdx);
> +      newTreeEntry(VL, true, UserTreeIdx, S);
>        DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
>
>        // Sort operands of the instructions so that each side is more
> likely to
> @@ -1709,8 +1831,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>        for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
>          ValueList Operands;
>          // Prepare the operand vector.
> -        for (Value *j : VL)
> -          Operands.push_back(cast<Instruction>(j)->getOperand(i));
> +        for (Value *VecOp : VL) {
> +          auto *I = cast<Instruction>(VecOp);
> +          if (I->getOpcode() == S.Opcode) {
> +             Operands.push_back(I->getOperand(i));
> +             continue;
> +          }
> +          assert(Instruction::isBinaryOp(S.Opcode) &&
> +                  "Expected a binary operation.");
> +          Value *Operand = isOdd(i)
> +                        ? getDefaultConstantForOpcode(S.Opcode,
> I->getType())
> +                        : VecOp;
> +          Operands.push_back(Operand);
> +        }
>
>          buildTree_rec(Operands, Depth + 1, UserTreeIdx);
>        }
> @@ -1722,7 +1855,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>          if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
>            DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested
> indexes).\n");
>            BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx);
> +          newTreeEntry(VL, false, UserTreeIdx, S);
>            return;
>          }
>        }
> @@ -1735,7 +1868,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>          if (Ty0 != CurTy) {
>            DEBUG(dbgs() << "SLP: not-vectorizable GEP (different
> types).\n");
>            BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx);
> +          newTreeEntry(VL, false, UserTreeIdx, S);
>            return;
>          }
>        }
> @@ -1747,12 +1880,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>            DEBUG(
>                dbgs() << "SLP: not-vectorizable GEP (non-constant
> indexes).\n");
>            BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx);
> +          newTreeEntry(VL, false, UserTreeIdx, S);
>            return;
>          }
>        }
>
> -      newTreeEntry(VL, true, UserTreeIdx);
> +      newTreeEntry(VL, true, UserTreeIdx, S);
>        DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
>        for (unsigned i = 0, e = 2; i < e; ++i) {
>          ValueList Operands;
> @@ -1769,12 +1902,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>        for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
>          if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
>            BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx);
> +          newTreeEntry(VL, false, UserTreeIdx, S);
>            DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
>            return;
>          }
>
> -      newTreeEntry(VL, true, UserTreeIdx);
> +      newTreeEntry(VL, true, UserTreeIdx, S);
>        DEBUG(dbgs() << "SLP: added a vector of stores.\n");
>
>        ValueList Operands;
> @@ -1792,7 +1925,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
>        if (!isTriviallyVectorizable(ID)) {
>          BS.cancelScheduling(VL, VL0);
> -        newTreeEntry(VL, false, UserTreeIdx);
> +        newTreeEntry(VL, false, UserTreeIdx, S);
>          DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
>          return;
>        }
> @@ -1806,7 +1939,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>              getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
>              !CI->hasIdenticalOperandBundleSchema(*CI2)) {
>            BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx);
> +          newTreeEntry(VL, false, UserTreeIdx, S);
>            DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" <<
> *VL[i]
>                         << "\n");
>            return;
> @@ -1817,7 +1950,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>            Value *A1J = CI2->getArgOperand(1);
>            if (A1I != A1J) {
>              BS.cancelScheduling(VL, VL0);
> -            newTreeEntry(VL, false, UserTreeIdx);
> +            newTreeEntry(VL, false, UserTreeIdx, S);
>              DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
>                           << " argument "<< A1I<<"!=" << A1J
>                           << "\n");
> @@ -1830,14 +1963,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>                          CI->op_begin() + CI->getBundleOperandsEndIndex(),
>                          CI2->op_begin() + CI2->
> getBundleOperandsStartIndex())) {
>            BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx);
> +          newTreeEntry(VL, false, UserTreeIdx, S);
>            DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" <<
> *CI << "!="
>                         << *VL[i] << '\n');
>            return;
>          }
>        }
>
> -      newTreeEntry(VL, true, UserTreeIdx);
> +      newTreeEntry(VL, true, UserTreeIdx, S);
>        for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
>          ValueList Operands;
>          // Prepare the operand vector.
> @@ -1854,11 +1987,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>        // then do not vectorize this instruction.
>        if (!S.IsAltShuffle) {
>          BS.cancelScheduling(VL, VL0);
> -        newTreeEntry(VL, false, UserTreeIdx);
> +        newTreeEntry(VL, false, UserTreeIdx, S);
>          DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
>          return;
>        }
> -      newTreeEntry(VL, true, UserTreeIdx);
> +      newTreeEntry(VL, true, UserTreeIdx, S);
>        DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
>
>        // Reorder operands if reordering would enable vectorization.
> @@ -1873,8 +2006,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>        for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
>          ValueList Operands;
>          // Prepare the operand vector.
> -        for (Value *j : VL)
> -          Operands.push_back(cast<Instruction>(j)->getOperand(i));
> +        for (Value *VecOp : VL) {
> +          auto *I = cast<Instruction>(VecOp);
> +          if (sameOpcodeOrAlt(S.Opcode, AltOpcode, I->getOpcode())) {
> +            Operands.push_back(I->getOperand(i));
> +            continue;
> +          }
> +          assert(Instruction::isBinaryOp(S.Opcode) &&
> +                  "Expected a binary operation.");
> +          Value *Operand = isOdd(i)
> +                        ? getDefaultConstantForOpcode(S.Opcode,
> I->getType())
> +                        : VecOp;
> +          Operands.push_back(Operand);
> +        }
>
>          buildTree_rec(Operands, Depth + 1, UserTreeIdx);
>        }
> @@ -1882,7 +2026,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>
>      default:
>        BS.cancelScheduling(VL, VL0);
> -      newTreeEntry(VL, false, UserTreeIdx);
> +      newTreeEntry(VL, false, UserTreeIdx, S);
>        DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
>        return;
>    }
> @@ -2003,18 +2147,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>      }
>      return getGatherCost(E->Scalars);
>    }
> -  InstructionsState S = getSameOpcode(VL);
> -  assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
> -  Instruction *VL0 = cast<Instruction>(S.OpValue);
> -  unsigned ShuffleOrOp = S.IsAltShuffle ?
> -               (unsigned) Instruction::ShuffleVector : S.Opcode;
> +  assert(E->State.Opcode && allSameType(VL) && allSameBlock(VL) &&
> "Invalid VL");
> +  auto *VL0 = cast<Instruction>(E->State.OpValue);
> +  unsigned ShuffleOrOp = E->State.IsAltShuffle ?
> +               (unsigned) Instruction::ShuffleVector : E->State.Opcode;
>    switch (ShuffleOrOp) {
>      case Instruction::PHI:
>        return 0;
>
>      case Instruction::ExtractValue:
>      case Instruction::ExtractElement:
> -      if (canReuseExtract(VL, S.OpValue)) {
> +      if (canReuseExtract(VL, E->State.OpValue)) {
>          int DeadCost = 0;
>          for (unsigned i = 0, e = VL.size(); i < e; ++i) {
>            Instruction *E = cast<Instruction>(VL[i]);
> @@ -2058,8 +2201,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>        // Calculate the cost of this instruction.
>        VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(),
> VL.size());
>        int ScalarCost = VecTy->getNumElements() *
> -          TTI->getCmpSelInstrCost(S.Opcode, ScalarTy,
> Builder.getInt1Ty(), VL0);
> -      int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy,
> VL0);
> +          TTI->getCmpSelInstrCost(ShuffleOrOp, ScalarTy,
> Builder.getInt1Ty(), VL0);
> +      int VecCost = TTI->getCmpSelInstrCost(ShuffleOrOp, VecTy, MaskTy,
> VL0);
>        return VecCost - ScalarCost;
>      }
>      case Instruction::Add:
> @@ -2085,7 +2228,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>        TargetTransformInfo::OperandValueKind Op1VK =
>            TargetTransformInfo::OK_AnyValue;
>        TargetTransformInfo::OperandValueKind Op2VK =
> -          TargetTransformInfo::OK_UniformConstantValue;
> +          TargetTransformInfo::OK_AnyValue;
>        TargetTransformInfo::OperandValueProperties Op1VP =
>            TargetTransformInfo::OP_None;
>        TargetTransformInfo::OperandValueProperties Op2VP =
> @@ -2096,34 +2239,33 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>        // If instead not all operands are constants, then set the operand
> kind
>        // to OK_AnyValue. If all operands are constants but not the same,
>        // then set the operand kind to OK_NonUniformConstantValue.
> -      ConstantInt *CInt = nullptr;
> -      for (unsigned i = 0; i < VL.size(); ++i) {
> -        const Instruction *I = cast<Instruction>(VL[i]);
> -        if (!isa<ConstantInt>(I->getOperand(1))) {
> -          Op2VK = TargetTransformInfo::OK_AnyValue;
> -          break;
> -        }
> -        if (i == 0) {
> -          CInt = cast<ConstantInt>(I->getOperand(1));
> -          continue;
> +      if (auto *CInt = dyn_cast<ConstantInt>(VL0->getOperand(1))) {
> +        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
> +        const unsigned Opcode = E->State.Opcode;
> +        for (auto *V : VL) {
> +          auto *I = cast<Instruction>(V);
> +          if (I == VL0 || Opcode != I->getOpcode())
> +            continue;
> +          if (!isa<ConstantInt>(I->getOperand(1))) {
> +            Op2VK = TargetTransformInfo::OK_AnyValue;
> +            break;
> +          }
> +          if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
> +              CInt != cast<ConstantInt>(I->getOperand(1)))
> +            Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
>          }
> +        // FIXME: Currently cost of model modification for division by
> power of
> +        // 2 is handled for X86 and AArch64. Add support for other
> targets.
>          if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
> -            CInt != cast<ConstantInt>(I->getOperand(1)))
> -          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
> +            CInt->getValue().isPowerOf2())
> +          Op2VP = TargetTransformInfo::OP_PowerOf2;
>        }
> -      // FIXME: Currently cost of model modification for division by
> power of
> -      // 2 is handled for X86 and AArch64. Add support for other targets.
> -      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt
> &&
> -          CInt->getValue().isPowerOf2())
> -        Op2VP = TargetTransformInfo::OP_PowerOf2;
>
> -      SmallVector<const Value *, 4> Operands(VL0->operand_values());
> -      int ScalarCost =
> -          VecTy->getNumElements() *
> -          TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK,
> Op1VP,
> -                                      Op2VP, Operands);
> -      int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK,
> Op2VK,
> -                                                Op1VP, Op2VP, Operands);
> +      int ScalarCost = VecTy->getNumElements() *
> +                       TTI->getArithmeticInstrCost(E->State.Opcode,
> ScalarTy,
> +                                                   Op1VK, Op2VK, Op1VP,
> Op2VP);
> +      int VecCost = TTI->getArithmeticInstrCost(E->State.Opcode, VecTy,
> Op1VK,
> +                                                Op2VK, Op1VP, Op2VP);
>        return VecCost - ScalarCost;
>      }
>      case Instruction::GetElementPtr: {
> @@ -2189,23 +2331,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>            TargetTransformInfo::OK_AnyValue;
>        TargetTransformInfo::OperandValueKind Op2VK =
>            TargetTransformInfo::OK_AnyValue;
> -      int ScalarCost = 0;
> -      int VecCost = 0;
> -      for (Value *i : VL) {
> -        Instruction *I = cast<Instruction>(i);
> -        if (!I)
> -          break;
> -        ScalarCost +=
> -            TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK,
> Op2VK);
> -      }
> +      unsigned AltOpcode = getAltOpcode(E->State.Opcode);
> +      int ScalarCost =
> +          TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy, Op1VK,
> Op2VK) *
> +          VL.size() / 2;
> +      ScalarCost +=
> +          TTI->getArithmeticInstrCost(AltOpcode, ScalarTy, Op1VK, Op2VK)
> *
> +          VL.size() / 2;
>        // VecCost is equal to sum of the cost of creating 2 vectors
>        // and the cost of creating shuffle.
> -      Instruction *I0 = cast<Instruction>(VL[0]);
> -      VecCost =
> -          TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK,
> Op2VK);
> -      Instruction *I1 = cast<Instruction>(VL[1]);
> -      VecCost +=
> -          TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK,
> Op2VK);
> +      int VecCost =
> +          TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK,
> Op2VK);
> +      VecCost += TTI->getArithmeticInstrCost(AltOpcode, VecTy, Op1VK,
> Op2VK);
>        VecCost +=
>            TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy,
> 0);
>        return VecCost - ScalarCost;
> @@ -2271,7 +2408,7 @@ int BoUpSLP::getSpillCost() {
>    Instruction *PrevInst = nullptr;
>
>    for (const auto &N : VectorizableTree) {
> -    Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
> +    Instruction *Inst = dyn_cast<Instruction>(N.State.OpValue);
>      if (!Inst)
>        continue;
>
> @@ -2331,7 +2468,7 @@ int BoUpSLP::getTreeCost() {
>    for (TreeEntry &TE : VectorizableTree) {
>      int C = getEntryCost(&TE);
>      DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts
> with "
> -                 << *TE.Scalars[0] << ".\n");
> +                 << *TE.State.OpValue << ".\n");
>      Cost += C;
>    }
>
> @@ -2352,7 +2489,7 @@ int BoUpSLP::getTreeCost() {
>      // extend the extracted value back to the original type. Here, we
> account
>      // for the extract and the added cost of the sign extend if needed.
>      auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
> -    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
> +    auto *ScalarRoot = VectorizableTree[0].State.OpValue;
>      if (MinBWs.count(ScalarRoot)) {
>        auto *MinTy = IntegerType::get(F->getContext(),
> MinBWs[ScalarRoot].first);
>        auto Extend =
> @@ -2415,13 +2552,15 @@ void BoUpSLP::reorderAltShuffleOperands(
>                                          SmallVectorImpl<Value *> &Right) {
>    // Push left and right operands of binary operation into Left and Right
>    unsigned AltOpcode = getAltOpcode(Opcode);
> -  (void)AltOpcode;
>    for (Value *V : VL) {
>      auto *I = cast<Instruction>(V);
> -    assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
> -           "Incorrect instruction in vector");
> -    Left.push_back(I->getOperand(0));
> -    Right.push_back(I->getOperand(1));
> +    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
> +      Left.push_back(I->getOperand(0));
> +      Right.push_back(I->getOperand(1));
> +    } else {
> +      Left.push_back(I);
> +      Right.push_back(getDefaultConstantForOpcode(Opcode, I->getType()));
> +    }
>    }
>
>    // Reorder if we have a commutative operation and consecutive access
> @@ -2470,8 +2609,13 @@ static bool shouldReorderOperands(
>      int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
>      ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool
> AllSameOpcodeRight,
>      bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
> -  VLeft = I.getOperand(0);
> -  VRight = I.getOperand(1);
> +  if (I.getOpcode() == Opcode) {
> +    VLeft = I.getOperand(0);
> +    VRight = I.getOperand(1);
> +  } else {
> +    VLeft = &I;
> +    VRight = getDefaultConstantForOpcode(Opcode, I.getType());
> +  }
>    // If we have "SplatRight", try to see if commuting is needed to
> preserve it.
>    if (SplatRight) {
>      if (VRight == Right[i - 1])
> @@ -2535,8 +2679,15 @@ void BoUpSLP::reorderInputsAccordingToOp
>      // Peel the first iteration out of the loop since there's nothing
>      // interesting to do anyway and it simplifies the checks in the loop.
>      auto *I = cast<Instruction>(VL[0]);
> -    Value *VLeft = I->getOperand(0);
> -    Value *VRight = I->getOperand(1);
> +    Value *VLeft;
> +    Value *VRight;
> +    if (I->getOpcode() == Opcode) {
> +      VLeft = I->getOperand(0);
> +      VRight = I->getOperand(1);
> +    } else {
> +      VLeft = I;
> +      VRight = getDefaultConstantForOpcode(Opcode, I->getType());
> +    }
>      if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
>        // Favor having instruction to the right. FIXME: why?
>        std::swap(VLeft, VRight);
> @@ -2741,12 +2892,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>    IRBuilder<>::InsertPointGuard Guard(Builder);
>
>    if (E->VectorizedValue) {
> -    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] <<
> ".\n");
> +    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->State.OpValue <<
> ".\n");
>      return E->VectorizedValue;
>    }
>
> -  InstructionsState S = getSameOpcode(E->Scalars);
> -  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
> +  Instruction *VL0 = cast<Instruction>(E->State.OpValue);
>    Type *ScalarTy = VL0->getType();
>    if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
>      ScalarTy = SI->getValueOperand()->getType();
> @@ -2759,8 +2909,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>      return V;
>    }
>
> -  unsigned ShuffleOrOp = S.IsAltShuffle ?
> -           (unsigned) Instruction::ShuffleVector : S.Opcode;
> +  unsigned ShuffleOrOp = E->State.IsAltShuffle ?
> +           (unsigned) Instruction::ShuffleVector : E->State.Opcode;
>    switch (ShuffleOrOp) {
>      case Instruction::PHI: {
>        PHINode *PH = dyn_cast<PHINode>(VL0);
> @@ -2870,7 +3020,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>
>        CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
>        Value *V;
> -      if (S.Opcode == Instruction::FCmp)
> +      if (E->State.Opcode == Instruction::FCmp)
>          V = Builder.CreateFCmp(P0, L, R);
>        else
>          V = Builder.CreateICmp(P0, L, R);
> @@ -2922,13 +3072,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>      case Instruction::Xor: {
>        ValueList LHSVL, RHSVL;
>        if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
> -        reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
> +        reorderInputsAccordingToOpcode(E->State.Opcode, E->Scalars,
> LHSVL,
>                                         RHSVL);
>        else
>          for (Value *V : E->Scalars) {
>            auto *I = cast<Instruction>(V);
> -          LHSVL.push_back(I->getOperand(0));
> -          RHSVL.push_back(I->getOperand(1));
> +          if (I->getOpcode() == E->State.Opcode) {
> +            LHSVL.push_back(I->getOperand(0));
> +            RHSVL.push_back(I->getOperand(1));
> +          } else {
> +            LHSVL.push_back(V);
> +            RHSVL.push_back(
> +                getDefaultConstantForOpcode(E->State.Opcode,
> I->getType()));
> +          }
>          }
>
>        setInsertPointAfterBundle(E->Scalars, VL0);
> @@ -2940,7 +3096,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>          return V;
>
>        Value *V = Builder.CreateBinOp(
> -          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
> +          static_cast<Instruction::BinaryOps>(E->State.Opcode), LHS,
> RHS);
>        E->VectorizedValue = V;
>        propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
>        ++NumVectorInstructions;
> @@ -3091,9 +3247,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>      }
>      case Instruction::ShuffleVector: {
>        ValueList LHSVL, RHSVL;
> -      assert(Instruction::isBinaryOp(S.Opcode) &&
> +      assert(Instruction::isBinaryOp(E->State.Opcode) &&
>               "Invalid Shuffle Vector Operand");
> -      reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
> +      reorderAltShuffleOperands(E->State.Opcode, E->Scalars, LHSVL,
> RHSVL);
>        setInsertPointAfterBundle(E->Scalars, VL0);
>
>        Value *LHS = vectorizeTree(LHSVL);
> @@ -3104,9 +3260,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>
>        // Create a vector of LHS op1 RHS
>        Value *V0 = Builder.CreateBinOp(
> -          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
> +          static_cast<Instruction::BinaryOps>(E->State.Opcode), LHS,
> RHS);
>
> -      unsigned AltOpcode = getAltOpcode(S.Opcode);
> +      unsigned AltOpcode = getAltOpcode(E->State.Opcode);
>        // Create a vector of LHS op2 RHS
>        Value *V1 = Builder.CreateBinOp(
>            static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
> @@ -3128,8 +3284,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>        }
>
>        Value *ShuffleMask = ConstantVector::get(Mask);
> -      propagateIRFlags(V0, EvenScalars);
> -      propagateIRFlags(V1, OddScalars);
> +      InstructionsState S = getSameOpcode(EvenScalars);
> +      assert(!S.IsAltShuffle && "Unexpected alternate opcode");
> +      propagateIRFlags(V0, EvenScalars, S.OpValue);
> +
> +      S = getSameOpcode(OddScalars);
> +      assert(!S.IsAltShuffle && "Unexpected alternate opcode");
> +      propagateIRFlags(V1, OddScalars, S.OpValue);
>
>        Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
>        E->VectorizedValue = V;
> @@ -3163,7 +3324,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebug
>    // If the vectorized tree can be rewritten in a smaller type, we
> truncate the
>    // vectorized root. InstCombine will then rewrite the entire
> expression. We
>    // sign extend the extracted values below.
> -  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
> +  auto *ScalarRoot = VectorizableTree[0].State.OpValue;
>    if (MinBWs.count(ScalarRoot)) {
>      if (auto *I = dyn_cast<Instruction>(VectorRoot))
>        Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
> @@ -3274,9 +3435,15 @@ BoUpSLP::vectorizeTree(ExtraValueToDebug
>      assert(Entry->VectorizedValue && "Can't find vectorizable value");
>
>      // For each lane:
> +    const unsigned Opcode = Entry->State.Opcode;
> +    const unsigned AltOpcode = getAltOpcode(Opcode);
>      for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
>        Value *Scalar = Entry->Scalars[Lane];
>
> +      if (!sameOpcodeOrAlt(Opcode, AltOpcode,
> +                           cast<Instruction>(Scalar)->getOpcode()))
> +        continue;
> +
>        Type *Ty = Scalar->getType();
>        if (!Ty->isVoidTy()) {
>  #ifndef NDEBUG
> @@ -3408,7 +3575,7 @@ bool BoUpSLP::BlockScheduling::trySchedu
>    }
>
>    for (Value *V : VL) {
> -    ScheduleData *BundleMember = getScheduleData(V);
> +    ScheduleData *BundleMember = getScheduleData(V, isOneOf(OpValue, V));
>      assert(BundleMember &&
>             "no ScheduleData for bundle member (maybe not in same basic
> block)");
>      if (BundleMember->IsScheduled) {
> @@ -3481,7 +3648,7 @@ void BoUpSLP::BlockScheduling::cancelSch
>    if (isa<PHINode>(OpValue))
>      return;
>
> -  ScheduleData *Bundle = getScheduleData(OpValue);
> +  ScheduleData *Bundle = getScheduleData(OpValue)->FirstInBundle;
>    DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
>    assert(!Bundle->IsScheduled &&
>           "Can't cancel bundle which is already scheduled");
> @@ -3784,7 +3951,7 @@ void BoUpSLP::scheduleBlock(BlockSchedul
>         I = I->getNextNode()) {
>      BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData
> *SD) {
>        assert(SD->isPartOfBundle() ==
> -                 (getTreeEntry(SD->Inst) != nullptr) &&
> +                 (getTreeEntry(SD->Inst, SD->OpValue) != nullptr) &&
>               "scheduler and vectorizer bundle mismatch");
>        SD->FirstInBundle->SchedulingPriority = Idx++;
>        if (SD->isSchedulingEntity()) {
> @@ -3807,15 +3974,15 @@ void BoUpSLP::scheduleBlock(BlockSchedul
>      ScheduleData *BundleMember = picked;
>      while (BundleMember) {
>        Instruction *pickedInst = BundleMember->Inst;
> -      if (LastScheduledInst->getNextNode() != pickedInst) {
> -        BS->BB->getInstList().remove(pickedInst);
> -        BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
> -                                     pickedInst);
> +      if (pickedInst == BundleMember->OpValue) {
> +        if (LastScheduledInst->getNextNode() != pickedInst) {
> +          BS->BB->getInstList().remove(pickedInst);
> +          BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
> pickedInst);
> +        }
> +        LastScheduledInst = pickedInst;
>        }
> -      LastScheduledInst = pickedInst;
>        BundleMember = BundleMember->NextInBundle;
>      }
> -
>      BS->schedule(picked, ReadyInsts);
>      NumToSchedule--;
>    }
> @@ -5146,7 +5313,9 @@ public:
>                                          VectorizedTree, ReducedSubTree,
>                                          ReductionData.getKind());
>          VectorizedTree = VectReductionData.createOp(Builder, "op.rdx");
> -        propagateIRFlags(VectorizedTree, ReductionOps);
> +        InstructionsState S = getSameOpcode(ReductionOps);
> +        assert(!S.IsAltShuffle && "Unexpected alternate opcode");
> +        propagateIRFlags(VectorizedTree, ReductionOps, S.OpValue);
>        } else
>          VectorizedTree = ReducedSubTree;
>        i += ReduxWidth;
> @@ -5162,7 +5331,9 @@ public:
>                                          VectorizedTree, I,
>                                          ReductionData.getKind());
>          VectorizedTree = VectReductionData.createOp(Builder);
> -        propagateIRFlags(VectorizedTree, ReductionOps);
> +        InstructionsState S = getSameOpcode(ReductionOps);
> +        assert(!S.IsAltShuffle && "Unexpected alternate opcode");
> +        propagateIRFlags(VectorizedTree, ReductionOps, S.OpValue);
>        }
>        for (auto &Pair : ExternallyUsedValues) {
>          assert(!Pair.second.empty() &&
> @@ -5174,7 +5345,9 @@ public:
>                                            VectorizedTree, Pair.first,
>                                            ReductionData.getKind());
>            VectorizedTree = VectReductionData.createOp(Builder,
> "op.extra");
> -          propagateIRFlags(VectorizedTree, I);
> +          InstructionsState S = getSameOpcode(I);
> +          assert(!S.IsAltShuffle && "Unexpected alternate opcode");
> +          propagateIRFlags(VectorizedTree, I, S.OpValue);
>          }
>        }
>        // Update users.
> @@ -5284,7 +5457,9 @@ private:
>        OperationData VectReductionData(ReductionData.getOpcode(),
> LeftShuf,
>                                        RightShuf, ReductionData.getKind());
>        TmpVec = VectReductionData.createOp(Builder, "op.rdx");
> -      propagateIRFlags(TmpVec, RedOps);
> +      InstructionsState S = getSameOpcode(RedOps);
> +      assert(!S.IsAltShuffle && "Unexpected alternate opcode");
> +      propagateIRFlags(TmpVec, RedOps, S.OpValue);
>      }
>
>      // The result is in the first element of the vector.
>
> Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll?rev=313348&r1=313347&r2=313348&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll (original)
> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll Thu Sep 14 23:56:39 2017
> @@ -43,22 +43,16 @@ define void @add1(i32* noalias %dst, i32
>  ; CHECK-LABEL: @add1(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32*
> [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32*
> [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
> -; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 2
> -; CHECK-NEXT:    store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
> -; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2
>  ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 3
> -; CHECK-NEXT:    store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
> -; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
> -; CHECK-NEXT:    store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 0, i32 1, i32 2,
> i32 3>, [[TMP1]]
> +; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
> +; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
> @@ -86,22 +80,16 @@ define void @sub0(i32* noalias %dst, i32
>  ; CHECK-LABEL: @sub0(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32*
> [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
> -; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32*
> [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 2
> -; CHECK-NEXT:    store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
> -; CHECK-NEXT:    [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
>  ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 3
> -; CHECK-NEXT:    store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
> -; CHECK-NEXT:    [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3
> -; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 -1, i32 0, i32 -2,
> i32 -3>, [[TMP1]]
> +; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
> +; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
> @@ -205,22 +193,18 @@ define void @addsub0(i32* noalias %dst,
>  ; CHECK-LABEL: @addsub0(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32*
> [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
> -; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32*
> [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 2
> -; CHECK-NEXT:    store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
> -; CHECK-NEXT:    [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
>  ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 3
> -; CHECK-NEXT:    store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
> -; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
> -; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32
> 0, i32 -2, i32 -3>
> +; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], <i32 -1, i32
> 0, i32 -2, i32 -3>
> +; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x
> i32> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
> +; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
> +; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
> @@ -248,22 +232,18 @@ define void @addsub1(i32* noalias %dst,
>  ; CHECK-LABEL: @addsub1(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32*
> [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
> -; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32*
> [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
> -; CHECK-NEXT:    [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
>  ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 2
> -; CHECK-NEXT:    store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 3
> -; CHECK-NEXT:    store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
> -; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
> -; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32
> -1, i32 0, i32 -3>
> +; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], <i32 -1, i32
> -1, i32 0, i32 -3>
> +; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x
> i32> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
> +; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
> +; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
> @@ -291,22 +271,16 @@ define void @mul(i32* noalias %dst, i32*
>  ; CHECK-LABEL: @mul(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32*
> [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
> -; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32*
> [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store i32 [[MUL]], i32* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
> -; CHECK-NEXT:    [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 2
> -; CHECK-NEXT:    store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 3
> -; CHECK-NEXT:    store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
> -; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
> -; CHECK-NEXT:    store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw <4 x i32> <i32 257, i32 -3, i32
> 1, i32 -9>, [[TMP1]]
> +; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
> +; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
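For @mul the copyable element is modelled as a multiply by 1, which is why the expected vector constant is <257, -3, 1, -9>; the removed scalar CHECK lines above (muls by 257, -3 and -9 plus one plain store) fit a source along these lines. This is an inferred sketch, not the test's actual source:

  /* Inferred illustration; not the test's actual source. */
  void mul_sketch(int * __restrict dst, const int * __restrict src) {
    dst[0] = src[0] * 257;
    dst[1] = src[1] * -3;
    dst[2] = src[2];        /* copyable lane, vectorized as a multiply by 1 */
    dst[3] = src[3] * -9;
  }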
> @@ -334,22 +308,16 @@ define void @shl0(i32* noalias %dst, i32
>  ; CHECK-LABEL: @shl0(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32*
> [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32*
> [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
> -; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP1]], 1
>  ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 2
> -; CHECK-NEXT:    store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32*
> [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
> -; CHECK-NEXT:    [[SHL5:%.*]] = shl i32 [[TMP2]], 2
>  ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32*
> [[DST]], i64 3
> -; CHECK-NEXT:    store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
> -; CHECK-NEXT:    [[SHL8:%.*]] = shl i32 [[TMP3]], 3
> -; CHECK-NEXT:    store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 0, i32 1,
> i32 2, i32 3>
> +; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
> +; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
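@shl0 puts the plain copy in the first lane and folds it into the vector shift as a shift by 0 (<0, 1, 2, 3>). Inferred sketch:

  /* Inferred illustration; not the test's actual source. */
  void shl0_sketch(int * __restrict dst, const int * __restrict src) {
    dst[0] = src[0];        /* copyable lane, vectorized as << 0 */
    dst[1] = src[1] << 1;
    dst[2] = src[2] << 2;
    dst[3] = src[3] << 3;
  }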
> @@ -453,22 +421,16 @@ define void @add1f(float* noalias %dst,
>  ; CHECK-LABEL: @add1f(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float,
> float* [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float,
> float* [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store float [[TMP0]], float* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
> -; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 2
> -; CHECK-NEXT:    store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align
> 4
> -; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
>  ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 3
> -; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align
> 4
> -; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
> -; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float
> 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>,
> [[TMP1]]
> +; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
> +; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align
> 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
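@add1f is the floating-point analogue: the scalar fadds carry the fast flag, and the leading plain copy becomes lane 0 of the fadd constant <0.0, 1.0, 2.0, 3.0>. A sketch of the implied source, assuming fast-math was enabled when the test was generated (that assumption is mine, not stated in the patch):

  /* Inferred illustration; assumes fast-math, e.g. compiled with -ffast-math. */
  void add1f_sketch(float * __restrict dst, const float * __restrict src) {
    dst[0] = src[0];          /* copyable lane, vectorized as + 0.0 */
    dst[1] = src[1] + 1.0f;
    dst[2] = src[2] + 2.0f;
    dst[3] = src[3] + 3.0f;
  }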
> @@ -496,22 +458,16 @@ define void @sub0f(float* noalias %dst,
>  ; CHECK-LABEL: @sub0f(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float,
> float* [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
> -; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float,
> float* [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store float [[ADD]], float* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 2
> -; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align
> 4
> -; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
>  ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 3
> -; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align
> 4
> -; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
> -; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float
> -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float
> -3.000000e+00>, [[TMP1]]
> +; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
> +; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align
> 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
> @@ -615,22 +571,18 @@ define void @addsub0f(float* noalias %ds
>  ; CHECK-LABEL: @addsub0f(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float,
> float* [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
> -; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float,
> float* [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 2
> -; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align
> 4
> -; CHECK-NEXT:    [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
>  ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 3
> -; CHECK-NEXT:    store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align
> 4
> -; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
> -; CHECK-NEXT:    store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float
> -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
> +; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float
> -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
> +; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x
> float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
> +; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
> +; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align
> 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
> @@ -658,22 +610,18 @@ define void @addsub1f(float* noalias %ds
>  ; CHECK-LABEL: @addsub1f(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float,
> float* [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
> -; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float,
> float* [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
> -; CHECK-NEXT:    [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
>  ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 2
> -; CHECK-NEXT:    store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align
> 4
>  ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 3
> -; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align
> 4
> -; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
> -; CHECK-NEXT:    store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float
> -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
> +; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float
> -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
> +; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x
> float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
> +; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
> +; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align
> 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
> @@ -701,22 +649,16 @@ define void @mulf(float* noalias %dst, f
>  ; CHECK-LABEL: @mulf(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float,
> float* [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
> -; CHECK-NEXT:    [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float,
> float* [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
> -; CHECK-NEXT:    [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 2
> -; CHECK-NEXT:    store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align
> 4
>  ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 3
> -; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align
> 4
> -; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
> -; CHECK-NEXT:    store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> <float
> 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float
> -9.000000e+00>, [[TMP1]]
> +; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
> +; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align
> 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
> @@ -825,22 +767,16 @@ define void @sub0fn(float* noalias %dst,
>  ; CHECK-LABEL: @sub0fn(
>  ; CHECK-NEXT:  entry:
>  ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float,
> float* [[SRC:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
> -; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
>  ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float,
> float* [[DST:%.*]], i64 1
> -; CHECK-NEXT:    store float [[ADD]], float* [[DST]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 2
> -; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 2
> -; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
>  ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float,
> float* [[SRC]], i64 3
> -; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align
> 4
> -; CHECK-NEXT:    [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
>  ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float,
> float* [[DST]], i64 3
> -; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
> -; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align
> 4
> -; CHECK-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
> -; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]],
> align 4
> +; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> <float -1.000000e+00,
> float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
> +; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
> +; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align
> 4
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>



-- 
Regards,
Ilya Biryukov