[llvm] r318239 - Revert r318193 "[SLPVectorizer] Failure to beneficially vectorize 'copyable' elements in integer binary ops."

Mikael Holmén via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 15 23:08:23 PST 2017



On 11/15/2017 03:56 AM, Richard Smith via llvm-commits wrote:
> On 14 November 2017 at 16:38, Hans Wennborg via llvm-commits 
> <llvm-commits at lists.llvm.org> wrote:
> 
>     Author: hans
>     Date: Tue Nov 14 16:38:13 2017
>     New Revision: 318239
> 
>     URL: http://llvm.org/viewvc/llvm-project?rev=318239&view=rev
>     Log:
>     Revert r318193 "[SLPVectorizer] Failure to beneficially vectorize
>     'copyable' elements in integer binary ops."
> 
>     It crashes building sqlite; see reply on the llvm-commits thread.
> 
> 
> FWIW, this also crashes while performing an optimized bootstrap of Clang.

We see crashes when testing our out-of-tree target as well:

  #0 0x0000000001fe9714 PrintStackTraceSignalHandler(void*) (/data/repo/llvm-dev2/build-all/bin/opt+0x1fe9714)
  #1 0x0000000001fe9e86 SignalHandler(int) (/data/repo/llvm-dev2/build-all/bin/opt+0x1fe9e86)
  #2 0x00007f2d070a1330 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x10330)
  #3 0x000000000213db3d llvm::slpvectorizer::BoUpSLP::BlockScheduling::doForAllOpcodes(llvm::Value*, llvm::function_ref<void (llvm::slpvectorizer::BoUpSLP::ScheduleData*)>) (/data/repo/llvm-dev2/build-all/bin/opt+0x213db3d)
  #4 0x0000000002130901 llvm::slpvectorizer::BoUpSLP::BlockScheduling::tryScheduleBundle(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP*, llvm::Value*) (/data/repo/llvm-dev2/build-all/bin/opt+0x2130901)
  #5 0x000000000212c3d1 llvm::slpvectorizer::BoUpSLP::buildTree_rec(llvm::ArrayRef<llvm::Value*>, unsigned int, int) (/data/repo/llvm-dev2/build-all/bin/opt+0x212c3d1)
  #6 0x000000000212ac65 llvm::slpvectorizer::BoUpSLP::buildTree(llvm::ArrayRef<llvm::Value*>, llvm::MapVector<llvm::Value*, llvm::SmallVector<llvm::Instruction*, 2u>, llvm::DenseMap<llvm::Value*, unsigned int, llvm::DenseMapInfo<llvm::Value*>, llvm::detail::DenseMapPair<llvm::Value*, unsigned int> >, std::vector<std::pair<llvm::Value*, llvm::SmallVector<llvm::Instruction*, 2u> >, std::allocator<std::pair<llvm::Value*, llvm::SmallVector<llvm::Instruction*, 2u> > > > >&, llvm::ArrayRef<llvm::Value*>) (/data/repo/llvm-dev2/build-all/bin/opt+0x212ac65)
  #7 0x0000000002145322 llvm::SLPVectorizerPass::tryToVectorizeList(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP&, llvm::ArrayRef<llvm::Value*>, bool) (/data/repo/llvm-dev2/build-all/bin/opt+0x2145322)
  #8 0x0000000002145d1b llvm::SLPVectorizerPass::tryToVectorize(llvm::Instruction*, llvm::slpvectorizer::BoUpSLP&) (/data/repo/llvm-dev2/build-all/bin/opt+0x2145d1b)
  #9 0x00000000021469b3 tryToVectorizeHorReductionOrInstOperands(llvm::PHINode*, llvm::Instruction*, llvm::BasicBlock*, llvm::slpvectorizer::BoUpSLP&, llvm::TargetTransformInfo*, llvm::function_ref<bool (llvm::Instruction*, llvm::slpvectorizer::BoUpSLP&)>) (/data/repo/llvm-dev2/build-all/bin/opt+0x21469b3)
  #10 0x0000000002142263 llvm::SLPVectorizerPass::vectorizeChainsInBlock(llvm::BasicBlock*, llvm::slpvectorizer::BoUpSLP&) (/data/repo/llvm-dev2/build-all/bin/opt+0x2142263)
  #11 0x0000000002140e00 llvm::SLPVectorizerPass::runImpl(llvm::Function&, llvm::ScalarEvolution*, llvm::TargetTransformInfo*, llvm::TargetLibraryInfo*, llvm::AAResults*, llvm::LoopInfo*, llvm::DominatorTree*, llvm::AssumptionCache*, llvm::DemandedBits*, llvm::OptimizationRemarkEmitter*) (/data/repo/llvm-dev2/build-all/bin/opt+0x2140e00)
  #12 0x00000000021511a7 (anonymous namespace)::SLPVectorizer::runOnFunction(llvm::Function&) (/data/repo/llvm-dev2/build-all/bin/opt+0x21511a7)
  #13 0x0000000001abf278 llvm::FPPassManager::runOnFunction(llvm::Function&) (/data/repo/llvm-dev2/build-all/bin/opt+0x1abf278)
  #14 0x0000000001abf4b8 llvm::FPPassManager::runOnModule(llvm::Module&) (/data/repo/llvm-dev2/build-all/bin/opt+0x1abf4b8)
  #15 0x0000000001abf995 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/data/repo/llvm-dev2/build-all/bin/opt+0x1abf995)
  #16 0x000000000073bfbb main (/data/repo/llvm-dev2/build-all/bin/opt+0x73bfbb)
  #17 0x00007f2d05c7bf45 __libc_start_main /build/eglibc-SvCtMH/eglibc-2.19/csu/libc-start.c:321:0
  #18 0x0000000000729b61 _start (/data/repo/llvm-dev2/build-all/bin/opt+0x729b61)
  Stack dump:

I haven't looked into it at all since I saw it was reverted anyway.
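
(In case anyone wants to dig into it: the trace above is from a plain opt
run, so a reduced .ll file that reaches BoUpSLP::buildTree_rec should
reproduce it with something like "opt -slp-vectorizer -S reduced.ll". We
haven't extracted such a reproducer, though.)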

/Mikael

> 
>      > [SLPVectorizer] Failure to beneficially vectorize 'copyable' elements in integer binary ops.
>      >
>      >         The patch tries to improve vectorization of the following code:
>      >
>      >         void add1(int * __restrict dst, const int * __restrict src) {
>      >           *dst++ = *src++;
>      >           *dst++ = *src++ + 1;
>      >           *dst++ = *src++ + 2;
>      >           *dst++ = *src++ + 3;
>      >         }
>      >         This allows vectorization even if the very first operation is not a binary add, but just a load.
>      >
>      >         Fixed issues related to previous commit.
>      >
>      >         Reviewers: spatel, mzolotukhin, mkuper, hfinkel, RKSimon, filcab, ABataev
>      >
>      >         Reviewed By: ABataev, RKSimon
>      >
>      >         Subscribers: llvm-commits, RKSimon
>      >
>      >         Differential Revision: https://reviews.llvm.org/D28907
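
For context, the idea behind the "copyable elements" change, as far as I can
tell from the commit message and the removed getDefaultConstantForOpcode
below, is that a lane which is a plain copy gets paired with the identity
constant of the bundle's opcode (0 for add/sub/or/xor/shifts, 1 for
mul/div/rem, all-ones for and), so a mixed bundle like the one in add1 can
still be vectorized as a single binary op. Roughly, the intended result for
add1 would be something like this (illustrative IR, not actual pass output):

   %1 = bitcast i32* %src to <4 x i32>*
   %2 = load <4 x i32>, <4 x i32>* %1, align 4
   %3 = add nsw <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
   %4 = bitcast i32* %dst to <4 x i32>*
   store <4 x i32> %3, <4 x i32>* %4, align 4

where the plain copy in lane 0 just adds the identity constant 0.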
> 
>     Removed:
>          llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
>     Modified:
>          llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>          llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
> 
>     Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>     URL:
>     http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=318239&r1=318238&r2=318239&view=diff
>     ==============================================================================
>     --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
>     +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Tue Nov 14 16:38:13 2017
>     @@ -333,7 +333,7 @@ static unsigned getAltOpcode(unsigned Op
>         case Instruction::Sub:
>           return Instruction::Add;
>         default:
>     -    return Op;
>     +    return 0;
>         }
>       }
> 
>     @@ -346,20 +346,6 @@ static bool sameOpcodeOrAlt(unsigned Opc
>         return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
>       }
> 
>     -/// Checks if the \p Opcode can be considered as an operand of a (possibly)
>     -/// binary operation \p I.
>     -/// \returns The code of the binary operation of instruction \p I if the
>     -/// instruction with \p Opcode can be considered as an operand of \p I with the
>     -/// default value.
>     -static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
>     -  assert(!sameOpcodeOrAlt(Opcode, getAltOpcode(Opcode), I->getOpcode())
>     -           && "Invalid Opcode");
>     -  if (Opcode != Instruction::PHI && isa<BinaryOperator>(I) &&
>     -      (I->getType()->isIntegerTy() || cast<FPMathOperator>(I)->isFast()))
>     -    return I->getOpcode();
>     -  return 0;
>     -}
>     -
>       /// Chooses the correct key for scheduling data. If \p Op has the same (or
>       /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
>       /// OpValue.
>     @@ -381,12 +367,7 @@ namespace {
>       struct RawInstructionsData {
>         /// Main Opcode of the instructions going to be vectorized.
>         unsigned Opcode = 0;
>     -  /// Position of the first instruction with the \a Opcode.
>     -  unsigned OpcodePos = 0;
>     -  /// Need an additional analysis (if at least one of the instruction is not
>     -  /// same instruction kind as an instruction at OpcodePos position in the
>     -  /// list).
>     -  bool NeedAnalysis = false;
>     +
>         /// The list of instructions have some instructions with alternate opcodes.
>         bool HasAltOpcodes = false;
>       };
>     @@ -401,38 +382,16 @@ static RawInstructionsData getMainOpcode
>           return {};
>         RawInstructionsData Res;
>         unsigned Opcode = I0->getOpcode();
>     -  unsigned AltOpcode = getAltOpcode(Opcode);
>     -  unsigned NewOpcodePos = 0;
>         // Walk through the list of the vectorized instructions
>         // in order to check its structure described by RawInstructionsData.
>         for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
>           auto *I = dyn_cast<Instruction>(VL[Cnt]);
>           if (!I)
>             return {};
>     -    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
>     -      if (Opcode != I->getOpcode()) {
>     -        Res.HasAltOpcodes = true;
>     -        if (Res.NeedAnalysis && isOdd(NewOpcodePos))
>     -          std::swap(Opcode, AltOpcode);
>     -      }
>     -      continue;
>     -    }
>     -    if (unsigned NewOpcode = tryToRepresentAsInstArg(Opcode, I)) {
>     -      if (!Instruction::isBinaryOp(Opcode) ||
>     -          !Instruction::isCommutative(Opcode)) {
>     -        NewOpcodePos = Cnt;
>     -        Opcode = NewOpcode;
>     -        AltOpcode = getAltOpcode(Opcode);
>     -        Res.NeedAnalysis = true;
>     -      }
>     -    } else if (tryToRepresentAsInstArg(I->getOpcode(),
>     -                                       cast<Instruction>(VL[NewOpcodePos])))
>     -      Res.NeedAnalysis = true;
>     -    else
>     -      return {};
>     +    if (Opcode != I->getOpcode())
>     +      Res.HasAltOpcodes = true;
>         }
>         Res.Opcode = Opcode;
>     -  Res.OpcodePos = NewOpcodePos;
>         return Res;
>       }
> 
>     @@ -462,20 +421,16 @@ struct InstructionsState {
>       static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
>         auto Res = getMainOpcode(VL);
>         unsigned Opcode = Res.Opcode;
>     -  if (!Res.NeedAnalysis && !Res.HasAltOpcodes)
>     -    return InstructionsState(VL[Res.OpcodePos], Opcode, false);
>     -  auto *OpInst = cast<Instruction>(VL[Res.OpcodePos]);
>     +  if (!Res.HasAltOpcodes)
>     +    return InstructionsState(VL[0], Opcode, false);
>     +  auto *OpInst = cast<Instruction>(VL[0]);
>         unsigned AltOpcode = getAltOpcode(Opcode);
>         // Examine each element in the list instructions VL to determine
>         // if some operations there could be considered as an alternative
>     -  // (for example as subtraction relates to addition operation) or
>     -  // operation could be an operand of a (possibly) binary operation.
>     +  // (for example as subtraction relates to addition operation).
>         for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
>           auto *I = cast<Instruction>(VL[Cnt]);
>           unsigned InstOpcode = I->getOpcode();
>     -    if (Res.NeedAnalysis && !sameOpcodeOrAlt(Opcode, AltOpcode, InstOpcode))
>     -      if (tryToRepresentAsInstArg(InstOpcode, OpInst))
>     -        InstOpcode = (Res.HasAltOpcodes && isOdd(Cnt)) ? AltOpcode : Opcode;
>           if ((Res.HasAltOpcodes &&
>                InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
>               (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
>     @@ -628,7 +583,6 @@ public:
>         void deleteTree() {
>           VectorizableTree.clear();
>           ScalarToTreeEntry.clear();
>     -    ExtraScalarToTreeEntry.clear();
>           MustGather.clear();
>           ExternalUses.clear();
>           NumLoadsWantToKeepOrder = 0;
>     @@ -768,40 +722,22 @@ private:
>           /// The TreeEntry index containing the user of this entry.  We can actually
>           /// have multiple users so the data structure is not truly a tree.
>           SmallVector<int, 1> UserTreeIndices;
>     -
>     -    /// Info about instruction in this tree entry.
>     -    InstructionsState State;
>         };
> 
>         /// Create a new VectorizableTree entry.
>         TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
>     -                          int &UserTreeIdx, const InstructionsState &S) {
>     -    assert((!Vectorized || S.Opcode != 0) &&
>     -           "Vectorized TreeEntry without opcode");
>     +                          int &UserTreeIdx) {
>           VectorizableTree.emplace_back(VectorizableTree);
>           int idx = VectorizableTree.size() - 1;
>           TreeEntry *Last = &VectorizableTree[idx];
>           Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
>           Last->NeedToGather = !Vectorized;
>           if (Vectorized) {
>     -      Last->State = S;
>     -      unsigned AltOpcode = getAltOpcode(S.Opcode);
>             for (int i = 0, e = VL.size(); i != e; ++i) {
>     -        unsigned RealOpcode =
>     -            (S.IsAltShuffle && isOdd(i)) ? AltOpcode : S.Opcode;
>     -        Value *Key = (cast<Instruction>(VL[i])->getOpcode() == RealOpcode)
>     -                         ? VL[i]
>     -                         : S.OpValue;
>     -        assert(!getTreeEntry(VL[i], Key) && "Scalar already in tree!");
>     -        if (VL[i] == Key)
>     -          ScalarToTreeEntry[Key] = idx;
>     -        else
>     -          ExtraScalarToTreeEntry[VL[i]][Key] = idx;
>     +        assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
>     +        ScalarToTreeEntry[VL[i]] = idx;
>             }
>           } else {
>     -      Last->State.Opcode = 0;
>     -      Last->State.OpValue = VL[0];
>     -      Last->State.IsAltShuffle = false;
>             MustGather.insert(VL.begin(), VL.end());
>           }
> 
>     @@ -829,24 +765,8 @@ private:
>           return nullptr;
>         }
> 
>     -  TreeEntry *getTreeEntry(Value *V, Value *OpValue) {
>     -    if (V == OpValue)
>     -      return getTreeEntry(V);
>     -    auto I = ExtraScalarToTreeEntry.find(V);
>     -    if (I != ExtraScalarToTreeEntry.end()) {
>     -      auto &STT = I->second;
>     -      auto STTI = STT.find(OpValue);
>     -      if (STTI != STT.end())
>     -        return &VectorizableTree[STTI->second];
>     -    }
>     -    return nullptr;
>     -  }
>     -
>         /// Maps a specific scalar to its tree entry.
>     -  SmallDenseMap<Value *, int> ScalarToTreeEntry;
>     -
>     -  /// Maps a specific scalar to its tree entry(s) with leading scalar.
>     -  SmallDenseMap<Value *, SmallDenseMap<Value *, int>> ExtraScalarToTreeEntry;
>     +  SmallDenseMap<Value*, int> ScalarToTreeEntry;
> 
>         /// A list of scalars that we found that we need to keep as scalars.
>         ValueSet MustGather;
>     @@ -1418,15 +1338,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *
>             continue;
> 
>           // For each lane:
>     -    const unsigned Opcode = Entry->State.Opcode;
>     -    const unsigned AltOpcode = getAltOpcode(Opcode);
>         for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
>             Value *Scalar = Entry->Scalars[Lane];
> 
>     -      if (!sameOpcodeOrAlt(Opcode, AltOpcode,
>     -                           cast<Instruction>(Scalar)->getOpcode()))
>     -        continue;
>     -
>             // Check if the scalar is externally used as an extra arg.
>             auto ExtI = ExternallyUsedValues.find(Scalar);
>             if (ExtI != ExternallyUsedValues.end()) {
>     @@ -1469,37 +1383,6 @@ void BoUpSLP::buildTree(ArrayRef<Value *
>         }
>       }
> 
>     -static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) {
>     -  switch(Opcode) {
>     -  case Instruction::Add:
>     -  case Instruction::Sub:
>     -  case Instruction::Or:
>     -  case Instruction::Xor:
>     -  case Instruction::Shl:
>     -  case Instruction::LShr:
>     -  case Instruction::AShr:
>     -    return ConstantInt::getNullValue(Ty);
>     -  case Instruction::Mul:
>     -  case Instruction::UDiv:
>     -  case Instruction::SDiv:
>     -  case Instruction::URem:
>     -  case Instruction::SRem:
>     -    return ConstantInt::get(Ty, /*V=*/1);
>     -  case Instruction::FAdd:
>     -  case Instruction::FSub:
>     -    return ConstantFP::get(Ty, /*V=*/0.0);
>     -  case Instruction::FMul:
>     -  case Instruction::FDiv:
>     -  case Instruction::FRem:
>     -    return ConstantFP::get(Ty, /*V=*/1.0);
>     -  case Instruction::And:
>     -    return ConstantInt::getAllOnesValue(Ty);
>     -  default:
>     -    break;
>     -  }
>     -  llvm_unreachable("unknown binop for default constant value");
>     -}
>     -
>       void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
>                                   int UserTreeIdx) {
>         assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
>     @@ -1507,46 +1390,31 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>         InstructionsState S = getSameOpcode(VL);
>         if (Depth == RecursionMaxDepth) {
>           DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
>     -    newTreeEntry(VL, false, UserTreeIdx, S);
>     +    newTreeEntry(VL, false, UserTreeIdx);
>           return;
>         }
> 
>         // Don't handle vectors.
>         if (S.OpValue->getType()->isVectorTy()) {
>           DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
>     -    newTreeEntry(VL, false, UserTreeIdx, S);
>     +    newTreeEntry(VL, false, UserTreeIdx);
>           return;
>         }
> 
>         if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
>           if (SI->getValueOperand()->getType()->isVectorTy()) {
>             DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
>     -      newTreeEntry(VL, false, UserTreeIdx, S);
>     +      newTreeEntry(VL, false, UserTreeIdx);
>             return;
>           }
> 
>         // If all of the operands are identical or constant we have a simple solution.
>         if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
>           DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
>     -    newTreeEntry(VL, false, UserTreeIdx, S);
>     +    newTreeEntry(VL, false, UserTreeIdx);
>           return;
>         }
> 
>     -  // Avoid any vectors that are wider than two elements and
>     -  // with real operations less than or equal to half of vector
>     -  // to others members are operands to that operations.
>     -  unsigned AltOpcode = getAltOpcode(S.Opcode);
>     -  unsigned SameOrAlt = 0;
>     -  if (VL.size() > 2) {
>     -    for (Value *V : VL) {
>     -      auto *Instr = cast<Instruction>(V);
>     -      if (sameOpcodeOrAlt(S.Opcode, AltOpcode, Instr->getOpcode()))
>     -        SameOrAlt++;
>     -    }
>     -    if (SameOrAlt <= (VL.size() / 2))
>     -      return;
>     -  }
>     -
>         // We now know that this is a vector of instructions of the same type from
>         // the same block.
> 
>     @@ -1555,7 +1423,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>           if (EphValues.count(VL[i])) {
>             DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
>                   ") is ephemeral.\n");
>     -      newTreeEntry(VL, false, UserTreeIdx, S);
>     +      newTreeEntry(VL, false, UserTreeIdx);
>             return;
>           }
>         }
>     @@ -1566,7 +1434,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
>             if (E->Scalars[i] != VL[i]) {
>               DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
>     -        newTreeEntry(VL, false, UserTreeIdx, S);
>     +        newTreeEntry(VL, false, UserTreeIdx);
>               return;
>             }
>           }
>     @@ -1585,7 +1453,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>           if (getTreeEntry(I)) {
>             DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
>                   ") is already in tree.\n");
>     -      newTreeEntry(VL, false, UserTreeIdx, S);
>     +      newTreeEntry(VL, false, UserTreeIdx);
>             return;
>           }
>         }
>     @@ -1595,7 +1463,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>         for (unsigned i = 0, e = VL.size(); i != e; ++i) {
>           if (MustGather.count(VL[i])) {
>             DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
>     -      newTreeEntry(VL, false, UserTreeIdx, S);
>     +      newTreeEntry(VL, false, UserTreeIdx);
>             return;
>           }
>         }
>     @@ -1609,7 +1477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>         // Don't go into unreachable blocks. They may contain instructions with
>           // dependency cycles which confuse the final scheduling.
>           DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
>     -    newTreeEntry(VL, false, UserTreeIdx, S);
>     +    newTreeEntry(VL, false, UserTreeIdx);
>           return;
>         }
> 
>     @@ -1618,7 +1486,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>           for (unsigned j = i + 1; j < e; ++j)
>             if (VL[i] == VL[j]) {
>               DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
>     -        newTreeEntry(VL, false, UserTreeIdx, S);
>     +        newTreeEntry(VL, false, UserTreeIdx);
>               return;
>             }
> 
>     @@ -1633,7 +1501,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>           assert((!BS.getScheduleData(VL0) ||
>                   !BS.getScheduleData(VL0)->isPartOfBundle()) &&
>                  "tryScheduleBundle should cancelScheduling on failure");
>     -    newTreeEntry(VL, false, UserTreeIdx, S);
>     +    newTreeEntry(VL, false, UserTreeIdx);
>           return;
>         }
>         DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
>     @@ -1652,12 +1520,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>                 if (Term) {
>                   DEBUG(dbgs() << "SLP: Need to swizzle PHINodes
>     (TerminatorInst use).\n");
>                   BS.cancelScheduling(VL, VL0);
>     -            newTreeEntry(VL, false, UserTreeIdx, S);
>     +            newTreeEntry(VL, false, UserTreeIdx);
>                   return;
>                 }
>               }
> 
>     -      newTreeEntry(VL, true, UserTreeIdx, S);
>     +      newTreeEntry(VL, true, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
> 
>             for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
>     @@ -1679,7 +1547,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             } else {
>               BS.cancelScheduling(VL, VL0);
>             }
>     -      newTreeEntry(VL, Reuse, UserTreeIdx, S);
>     +      newTreeEntry(VL, Reuse, UserTreeIdx);
>             return;
>           }
>           case Instruction::Load: {
>     @@ -1694,7 +1562,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             if (DL->getTypeSizeInBits(ScalarTy) !=
>                 DL->getTypeAllocSizeInBits(ScalarTy)) {
>               BS.cancelScheduling(VL, VL0);
>     -        newTreeEntry(VL, false, UserTreeIdx, S);
>     +        newTreeEntry(VL, false, UserTreeIdx);
>               DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
>               return;
>             }
>     @@ -1705,7 +1573,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>               LoadInst *L = cast<LoadInst>(VL[i]);
>               if (!L->isSimple()) {
>                 BS.cancelScheduling(VL, VL0);
>     -          newTreeEntry(VL, false, UserTreeIdx, S);
>     +          newTreeEntry(VL, false, UserTreeIdx);
>                 DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
>                 return;
>               }
>     @@ -1727,7 +1595,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
> 
>             if (Consecutive) {
>               ++NumLoadsWantToKeepOrder;
>     -        newTreeEntry(VL, true, UserTreeIdx, S);
>     +        newTreeEntry(VL, true, UserTreeIdx);
>               DEBUG(dbgs() << "SLP: added a vector of loads.\n");
>               return;
>             }
>     @@ -1742,7 +1610,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>                 }
> 
>             BS.cancelScheduling(VL, VL0);
>     -      newTreeEntry(VL, false, UserTreeIdx, S);
>     +      newTreeEntry(VL, false, UserTreeIdx);
> 
>             if (ReverseConsecutive) {
>               ++NumLoadsWantToChangeOrder;
>     @@ -1769,12 +1637,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>               Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
>               if (Ty != SrcTy || !isValidElementType(Ty)) {
>                 BS.cancelScheduling(VL, VL0);
>     -          newTreeEntry(VL, false, UserTreeIdx, S);
>     +          newTreeEntry(VL, false, UserTreeIdx);
>                 DEBUG(dbgs() << "SLP: Gathering casts with different src
>     types.\n");
>                 return;
>               }
>             }
>     -      newTreeEntry(VL, true, UserTreeIdx, S);
>     +      newTreeEntry(VL, true, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: added a vector of casts.\n");
> 
>             for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
>     @@ -1797,13 +1665,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>               if (Cmp->getPredicate() != P0 ||
>                   Cmp->getOperand(0)->getType() != ComparedTy) {
>                 BS.cancelScheduling(VL, VL0);
>     -          newTreeEntry(VL, false, UserTreeIdx, S);
>     +          newTreeEntry(VL, false, UserTreeIdx);
>                 DEBUG(dbgs() << "SLP: Gathering cmp with different
>     predicate.\n");
>                 return;
>               }
>             }
> 
>     -      newTreeEntry(VL, true, UserTreeIdx, S);
>     +      newTreeEntry(VL, true, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: added a vector of compares.\n");
> 
>             for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
>     @@ -1835,7 +1703,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>           case Instruction::And:
>           case Instruction::Or:
>           case Instruction::Xor:
>     -      newTreeEntry(VL, true, UserTreeIdx, S);
>     +      newTreeEntry(VL, true, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
> 
>           // Sort operands of the instructions so that each side is more likely to
>     @@ -1851,21 +1719,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
>               ValueList Operands;
>               // Prepare the operand vector.
>     -        for (Value *VecOp : VL) {
>     -          auto *I = cast<Instruction>(VecOp);
>     -          if (I->getOpcode() == S.Opcode) {
>     -             Operands.push_back(I->getOperand(i));
>     -             continue;
>     -          }
>     -          assert(Instruction::isBinaryOp(S.Opcode) &&
>     -                  "Expected a binary operation.");
>     -          Value *Operand = isOdd(i)
>     -                        ? getDefaultConstantForOpcode(S.Opcode, I->getType())
>     -                        : VecOp;
>     -          Operands.push_back(Operand);
>     -        }
>     -        if (allSameType(Operands))
>     -          buildTree_rec(Operands, Depth + 1, UserTreeIdx);
>     +        for (Value *j : VL)
>     +          Operands.push_back(cast<Instruction>(j)->getOperand(i));
>     +
>     +        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
>             }
>             return;
> 
>     @@ -1875,7 +1732,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>               if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
>                 DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested
>     indexes).\n");
>                 BS.cancelScheduling(VL, VL0);
>     -          newTreeEntry(VL, false, UserTreeIdx, S);
>     +          newTreeEntry(VL, false, UserTreeIdx);
>                 return;
>               }
>             }
>     @@ -1888,7 +1745,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>               if (Ty0 != CurTy) {
>                 DEBUG(dbgs() << "SLP: not-vectorizable GEP (different
>     types).\n");
>                 BS.cancelScheduling(VL, VL0);
>     -          newTreeEntry(VL, false, UserTreeIdx, S);
>     +          newTreeEntry(VL, false, UserTreeIdx);
>                 return;
>               }
>             }
>     @@ -1900,12 +1757,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>                 DEBUG(
>                     dbgs() << "SLP: not-vectorizable GEP (non-constant
>     indexes).\n");
>                 BS.cancelScheduling(VL, VL0);
>     -          newTreeEntry(VL, false, UserTreeIdx, S);
>     +          newTreeEntry(VL, false, UserTreeIdx);
>                 return;
>               }
>             }
> 
>     -      newTreeEntry(VL, true, UserTreeIdx, S);
>     +      newTreeEntry(VL, true, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
>             for (unsigned i = 0, e = 2; i < e; ++i) {
>               ValueList Operands;
>     @@ -1922,12 +1779,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
>               if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
>                 BS.cancelScheduling(VL, VL0);
>     -          newTreeEntry(VL, false, UserTreeIdx, S);
>     +          newTreeEntry(VL, false, UserTreeIdx);
>                 DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
>                 return;
>               }
> 
>     -      newTreeEntry(VL, true, UserTreeIdx, S);
>     +      newTreeEntry(VL, true, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: added a vector of stores.\n");
> 
>             ValueList Operands;
>     @@ -1945,7 +1802,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
>             if (!isTriviallyVectorizable(ID)) {
>               BS.cancelScheduling(VL, VL0);
>     -        newTreeEntry(VL, false, UserTreeIdx, S);
>     +        newTreeEntry(VL, false, UserTreeIdx);
>               DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
>               return;
>             }
>     @@ -1959,7 +1816,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>                   getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
>                   !CI->hasIdenticalOperandBundleSchema(*CI2)) {
>                 BS.cancelScheduling(VL, VL0);
>     -          newTreeEntry(VL, false, UserTreeIdx, S);
>     +          newTreeEntry(VL, false, UserTreeIdx);
>                 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!="
>     << *VL[i]
>                              << "\n");
>                 return;
>     @@ -1970,7 +1827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>                 Value *A1J = CI2->getArgOperand(1);
>                 if (A1I != A1J) {
>                   BS.cancelScheduling(VL, VL0);
>     -            newTreeEntry(VL, false, UserTreeIdx, S);
>     +            newTreeEntry(VL, false, UserTreeIdx);
>                   DEBUG(dbgs() << "SLP: mismatched arguments in call:"
>     << *CI
>                                << " argument "<< A1I<<"!=" << A1J
>                                << "\n");
>     @@ -1983,14 +1840,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>                             CI->op_begin() + CI->getBundleOperandsEndIndex(),
>                             CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
>                 BS.cancelScheduling(VL, VL0);
>     -          newTreeEntry(VL, false, UserTreeIdx, S);
>     +          newTreeEntry(VL, false, UserTreeIdx);
>                 DEBUG(dbgs() << "SLP: mismatched bundle operands in
>     calls:" << *CI << "!="
>                              << *VL[i] << '\n');
>                 return;
>               }
>             }
> 
>     -      newTreeEntry(VL, true, UserTreeIdx, S);
>     +      newTreeEntry(VL, true, UserTreeIdx);
>             for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
>               ValueList Operands;
>               // Prepare the operand vector.
>     @@ -2007,11 +1864,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             // then do not vectorize this instruction.
>             if (!S.IsAltShuffle) {
>               BS.cancelScheduling(VL, VL0);
>     -        newTreeEntry(VL, false, UserTreeIdx, S);
>     +        newTreeEntry(VL, false, UserTreeIdx);
>               DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
>               return;
>             }
>     -      newTreeEntry(VL, true, UserTreeIdx, S);
>     +      newTreeEntry(VL, true, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
> 
>             // Reorder operands if reordering would enable vectorization.
>     @@ -2026,19 +1883,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
>               ValueList Operands;
>               // Prepare the operand vector.
>     -        for (Value *VecOp : VL) {
>     -          auto *I = cast<Instruction>(VecOp);
>     -          if (sameOpcodeOrAlt(S.Opcode, AltOpcode, I->getOpcode())) {
>     -            Operands.push_back(I->getOperand(i));
>     -            continue;
>     -          }
>     -          assert(Instruction::isBinaryOp(S.Opcode) &&
>     -                  "Expected a binary operation.");
>     -          Value *Operand = isOdd(i)
>     -                        ? getDefaultConstantForOpcode(S.Opcode, I->getType())
>     -                        : VecOp;
>     -          Operands.push_back(Operand);
>     -        }
>     +        for (Value *j : VL)
>     +          Operands.push_back(cast<Instruction>(j)->getOperand(i));
> 
>               buildTree_rec(Operands, Depth + 1, UserTreeIdx);
>             }
>     @@ -2046,7 +1892,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
> 
>           default:
>             BS.cancelScheduling(VL, VL0);
>     -      newTreeEntry(VL, false, UserTreeIdx, S);
>     +      newTreeEntry(VL, false, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
>             return;
>         }
>     @@ -2167,17 +2013,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>           }
>           return getGatherCost(E->Scalars);
>         }
>     -  assert(E->State.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
>     -  auto *VL0 = cast<Instruction>(E->State.OpValue);
>     -  unsigned ShuffleOrOp = E->State.IsAltShuffle ?
>     -               (unsigned) Instruction::ShuffleVector : E->State.Opcode;
>     +  InstructionsState S = getSameOpcode(VL);
>     +  assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
>     +  Instruction *VL0 = cast<Instruction>(S.OpValue);
>     +  unsigned ShuffleOrOp = S.IsAltShuffle ?
>     +               (unsigned) Instruction::ShuffleVector : S.Opcode;
>         switch (ShuffleOrOp) {
>           case Instruction::PHI:
>             return 0;
> 
>           case Instruction::ExtractValue:
>           case Instruction::ExtractElement:
>     -      if (canReuseExtract(VL, E->State.OpValue)) {
>     +      if (canReuseExtract(VL, S.OpValue)) {
>               int DeadCost = 0;
>               for (unsigned i = 0, e = VL.size(); i < e; ++i) {
>                 Instruction *E = cast<Instruction>(VL[i]);
>     @@ -2221,8 +2068,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>             // Calculate the cost of this instruction.
>           VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
>             int ScalarCost = VecTy->getNumElements() *
>     -          TTI->getCmpSelInstrCost(ShuffleOrOp, ScalarTy, Builder.getInt1Ty(), VL0);
>     -      int VecCost = TTI->getCmpSelInstrCost(ShuffleOrOp, VecTy, MaskTy, VL0);
>     +          TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
>     +      int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
>             return VecCost - ScalarCost;
>           }
>           case Instruction::Add:
>     @@ -2248,7 +2095,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>             TargetTransformInfo::OperandValueKind Op1VK =
>                 TargetTransformInfo::OK_AnyValue;
>             TargetTransformInfo::OperandValueKind Op2VK =
>     -          TargetTransformInfo::OK_AnyValue;
>     +          TargetTransformInfo::OK_UniformConstantValue;
>             TargetTransformInfo::OperandValueProperties Op1VP =
>                 TargetTransformInfo::OP_None;
>             TargetTransformInfo::OperandValueProperties Op2VP =
>     @@ -2259,33 +2106,34 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>           // If instead not all operands are constants, then set the operand kind
>           // to OK_AnyValue. If all operands are constants but not the same,
>           // then set the operand kind to OK_NonUniformConstantValue.
>     -      if (auto *CInt = dyn_cast<ConstantInt>(VL0->getOperand(1))) {
>     -        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
>     -        const unsigned Opcode = E->State.Opcode;
>     -        for (auto *V : VL) {
>     -          auto *I = cast<Instruction>(V);
>     -          if (I == VL0 || Opcode != I->getOpcode())
>     -            continue;
>     -          if (!isa<ConstantInt>(I->getOperand(1))) {
>     -            Op2VK = TargetTransformInfo::OK_AnyValue;
>     -            break;
>     -          }
>     -          if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
>     -              CInt != cast<ConstantInt>(I->getOperand(1)))
>     -            Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
>     +      ConstantInt *CInt = nullptr;
>     +      for (unsigned i = 0; i < VL.size(); ++i) {
>     +        const Instruction *I = cast<Instruction>(VL[i]);
>     +        if (!isa<ConstantInt>(I->getOperand(1))) {
>     +          Op2VK = TargetTransformInfo::OK_AnyValue;
>     +          break;
>     +        }
>     +        if (i == 0) {
>     +          CInt = cast<ConstantInt>(I->getOperand(1));
>     +          continue;
>               }
>     -        // FIXME: Currently cost of model modification for division by power of
>     -        // 2 is handled for X86 and AArch64. Add support for other targets.
>             if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
>     -            CInt->getValue().isPowerOf2())
>     -          Op2VP = TargetTransformInfo::OP_PowerOf2;
>     +            CInt != cast<ConstantInt>(I->getOperand(1)))
>     +          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
>           }
>     +      // FIXME: Currently cost of model modification for division by power of
>     +      // 2 is handled for X86 and AArch64. Add support for other targets.
>     +      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
>     +          CInt->getValue().isPowerOf2())
>     +        Op2VP = TargetTransformInfo::OP_PowerOf2;
> 
>     -      int ScalarCost = VecTy->getNumElements() *
>     -                       TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy,
>     -                                                   Op1VK, Op2VK, Op1VP, Op2VP);
>     -      int VecCost = TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK,
>     -                                                Op2VK, Op1VP, Op2VP);
>     +      SmallVector<const Value *, 4> Operands(VL0->operand_values());
>     +      int ScalarCost =
>     +          VecTy->getNumElements() *
>     +          TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
>     +                                      Op2VP, Operands);
>     +      int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
>     +                                                Op1VP, Op2VP, Operands);
>             return VecCost - ScalarCost;
>           }
>           case Instruction::GetElementPtr: {
>     @@ -2351,18 +2199,23 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>                 TargetTransformInfo::OK_AnyValue;
>             TargetTransformInfo::OperandValueKind Op2VK =
>                 TargetTransformInfo::OK_AnyValue;
>     -      unsigned AltOpcode = getAltOpcode(E->State.Opcode);
>     -      int ScalarCost =
>     -          TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy, Op1VK, Op2VK) *
>     -          VL.size() / 2;
>     -      ScalarCost +=
>     -          TTI->getArithmeticInstrCost(AltOpcode, ScalarTy, Op1VK, Op2VK) *
>     -          VL.size() / 2;
>     +      int ScalarCost = 0;
>     +      int VecCost = 0;
>     +      for (Value *i : VL) {
>     +        Instruction *I = cast<Instruction>(i);
>     +        if (!I)
>     +          break;
>     +        ScalarCost +=
>     +            TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
>     +      }
>             // VecCost is equal to sum of the cost of creating 2 vectors
>             // and the cost of creating shuffle.
>     -      int VecCost =
>     -          TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK, Op2VK);
>     -      VecCost += TTI->getArithmeticInstrCost(AltOpcode, VecTy, Op1VK, Op2VK);
>     +      Instruction *I0 = cast<Instruction>(VL[0]);
>     +      VecCost =
>     +          TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
>     +      Instruction *I1 = cast<Instruction>(VL[1]);
>     +      VecCost +=
>     +          TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
>           VecCost +=
>               TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
>             return VecCost - ScalarCost;
>     @@ -2428,7 +2281,7 @@ int BoUpSLP::getSpillCost() {
>         Instruction *PrevInst = nullptr;
> 
>         for (const auto &N : VectorizableTree) {
>     -    Instruction *Inst = dyn_cast<Instruction>(N.State.OpValue);
>     +    Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
>           if (!Inst)
>             continue;
> 
>     @@ -2488,7 +2341,7 @@ int BoUpSLP::getTreeCost() {
>         for (TreeEntry &TE : VectorizableTree) {
>           int C = getEntryCost(&TE);
>           DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that
>     starts with "
>     -                 << *TE.State.OpValue << ".\n");
>     +                 << *TE.Scalars[0] << ".\n");
>           Cost += C;
>         }
> 
>     @@ -2509,7 +2362,7 @@ int BoUpSLP::getTreeCost() {
>         // extend the extracted value back to the original type. Here, we account
>         // for the extract and the added cost of the sign extend if needed.
>           auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
>     -    auto *ScalarRoot = VectorizableTree[0].State.OpValue;
>     +    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
>           if (MinBWs.count(ScalarRoot)) {
>           auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
>             auto Extend =
>     @@ -2572,15 +2425,13 @@ void BoUpSLP::reorderAltShuffleOperands(
>                                               SmallVectorImpl<Value *> &Right) {
>         // Push left and right operands of binary operation into Left and Right
>         unsigned AltOpcode = getAltOpcode(Opcode);
>     +  (void)AltOpcode;
>         for (Value *V : VL) {
>           auto *I = cast<Instruction>(V);
>     -    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
>     -      Left.push_back(I->getOperand(0));
>     -      Right.push_back(I->getOperand(1));
>     -    } else {
>     -      Left.push_back(I);
>     -      Right.push_back(getDefaultConstantForOpcode(Opcode, I->getType()));
>     -    }
>     +    assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
>     +           "Incorrect instruction in vector");
>     +    Left.push_back(I->getOperand(0));
>     +    Right.push_back(I->getOperand(1));
>         }
> 
>         // Reorder if we have a commutative operation and consecutive access
>     @@ -2629,13 +2480,8 @@ static bool shouldReorderOperands(
>           int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
>         ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
>           bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
>     -  if (I.getOpcode() == Opcode) {
>     -    VLeft = I.getOperand(0);
>     -    VRight = I.getOperand(1);
>     -  } else {
>     -    VLeft = &I;
>     -    VRight = getDefaultConstantForOpcode(Opcode, I.getType());
>     -  }
>     +  VLeft = I.getOperand(0);
>     +  VRight = I.getOperand(1);
>         // If we have "SplatRight", try to see if commuting is needed to
>     preserve it.
>         if (SplatRight) {
>           if (VRight == Right[i - 1])
>     @@ -2699,15 +2545,8 @@ void BoUpSLP::reorderInputsAccordingToOp
>           // Peel the first iteration out of the loop since there's nothing
>         // interesting to do anyway and it simplifies the checks in the loop.
>           auto *I = cast<Instruction>(VL[0]);
>     -    Value *VLeft;
>     -    Value *VRight;
>     -    if (I->getOpcode() == Opcode) {
>     -      VLeft = I->getOperand(0);
>     -      VRight = I->getOperand(1);
>     -    } else {
>     -      VLeft = I;
>     -      VRight = getDefaultConstantForOpcode(Opcode, I->getType());
>     -    }
>     +    Value *VLeft = I->getOperand(0);
>     +    Value *VRight = I->getOperand(1);
>           if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
>             // Favor having instruction to the right. FIXME: why?
>             std::swap(VLeft, VRight);
>     @@ -2912,11 +2751,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>         IRBuilder<>::InsertPointGuard Guard(Builder);
> 
>         if (E->VectorizedValue) {
>     -    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->State.OpValue
>     << ".\n");
>     +    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] <<
>     ".\n");
>           return E->VectorizedValue;
>         }
> 
>     -  Instruction *VL0 = cast<Instruction>(E->State.OpValue);
>     +  InstructionsState S = getSameOpcode(E->Scalars);
>     +  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
>         Type *ScalarTy = VL0->getType();
>         if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
>           ScalarTy = SI->getValueOperand()->getType();
>     @@ -2929,8 +2769,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>           return V;
>         }
> 
>     -  unsigned ShuffleOrOp = E->State.IsAltShuffle ?
>     -           (unsigned) Instruction::ShuffleVector : E->State.Opcode;
>     +  unsigned ShuffleOrOp = S.IsAltShuffle ?
>     +           (unsigned) Instruction::ShuffleVector : S.Opcode;
>         switch (ShuffleOrOp) {
>           case Instruction::PHI: {
>             PHINode *PH = dyn_cast<PHINode>(VL0);
>     @@ -3040,7 +2880,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
> 
>             CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
>             Value *V;
>     -      if (E->State.Opcode == Instruction::FCmp)
>     +      if (S.Opcode == Instruction::FCmp)
>               V = Builder.CreateFCmp(P0, L, R);
>             else
>               V = Builder.CreateICmp(P0, L, R);
>     @@ -3092,19 +2932,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>           case Instruction::Xor: {
>             ValueList LHSVL, RHSVL;
>             if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
>     -        reorderInputsAccordingToOpcode(E->State.Opcode, E->Scalars, LHSVL,
>     +        reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
>                                              RHSVL);
>             else
>               for (Value *V : E->Scalars) {
>                 auto *I = cast<Instruction>(V);
>     -          if (I->getOpcode() == E->State.Opcode) {
>     -            LHSVL.push_back(I->getOperand(0));
>     -            RHSVL.push_back(I->getOperand(1));
>     -          } else {
>     -            LHSVL.push_back(V);
>     -            RHSVL.push_back(
>     -                getDefaultConstantForOpcode(E->State.Opcode, I->getType()));
>     -          }
>     +          LHSVL.push_back(I->getOperand(0));
>     +          RHSVL.push_back(I->getOperand(1));
>               }
> 
>             setInsertPointAfterBundle(E->Scalars, VL0);
>     @@ -3116,7 +2950,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>               return V;
> 
>             Value *V = Builder.CreateBinOp(
>     -          static_cast<Instruction::BinaryOps>(E->State.Opcode), LHS, RHS);
>     +          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
>             E->VectorizedValue = V;
>             propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
>             ++NumVectorInstructions;
>     @@ -3266,9 +3100,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>           }
>           case Instruction::ShuffleVector: {
>             ValueList LHSVL, RHSVL;
>     -      assert(Instruction::isBinaryOp(E->State.Opcode) &&
>     +      assert(Instruction::isBinaryOp(S.Opcode) &&
>                    "Invalid Shuffle Vector Operand");
>     -      reorderAltShuffleOperands(E->State.Opcode, E->Scalars, LHSVL, RHSVL);
>     +      reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
>             setInsertPointAfterBundle(E->Scalars, VL0);
> 
>             Value *LHS = vectorizeTree(LHSVL);
>     @@ -3279,9 +3113,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
> 
>             // Create a vector of LHS op1 RHS
>             Value *V0 = Builder.CreateBinOp(
>     -          static_cast<Instruction::BinaryOps>(E->State.Opcode), LHS, RHS);
>     +          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
> 
>     -      unsigned AltOpcode = getAltOpcode(E->State.Opcode);
>     +      unsigned AltOpcode = getAltOpcode(S.Opcode);
>             // Create a vector of LHS op2 RHS
>             Value *V1 = Builder.CreateBinOp(
>                 static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
>     @@ -3303,13 +3137,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>             }
> 
>             Value *ShuffleMask = ConstantVector::get(Mask);
>     -      InstructionsState S = getSameOpcode(EvenScalars);
>     -      assert(!S.IsAltShuffle && "Unexpected alternate opcode");
>     -      propagateIRFlags(V0, EvenScalars, S.OpValue);
>     -
>     -      S = getSameOpcode(OddScalars);
>     -      assert(!S.IsAltShuffle && "Unexpected alternate opcode");
>     -      propagateIRFlags(V1, OddScalars, S.OpValue);
>     +      propagateIRFlags(V0, EvenScalars);
>     +      propagateIRFlags(V1, OddScalars);
> 
>             Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
>             E->VectorizedValue = V;
>     @@ -3343,7 +3172,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebug
>         // If the vectorized tree can be rewritten in a smaller type, we truncate the
>         // vectorized root. InstCombine will then rewrite the entire expression. We
>         // sign extend the extracted values below.
>     -  auto *ScalarRoot = VectorizableTree[0].State.OpValue;
>     +  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
>         if (MinBWs.count(ScalarRoot)) {
>           if (auto *I = dyn_cast<Instruction>(VectorRoot))
>             Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
>     @@ -3454,15 +3283,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebug
>           assert(Entry->VectorizedValue && "Can't find vectorizable value");
> 
>           // For each lane:
>     -    const unsigned Opcode = Entry->State.Opcode;
>     -    const unsigned AltOpcode = getAltOpcode(Opcode);
>         for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
>             Value *Scalar = Entry->Scalars[Lane];
> 
>     -      if (!sameOpcodeOrAlt(Opcode, AltOpcode,
>     -                           cast<Instruction>(Scalar)->getOpcode()))
>     -        continue;
>     -
>             Type *Ty = Scalar->getType();
>             if (!Ty->isVoidTy()) {
>       #ifndef NDEBUG
>     @@ -3594,7 +3417,7 @@ bool BoUpSLP::BlockScheduling::trySchedu
>         }
> 
>         for (Value *V : VL) {
>     -    ScheduleData *BundleMember = getScheduleData(V, isOneOf(OpValue, V));
>     +    ScheduleData *BundleMember = getScheduleData(V);
>           assert(BundleMember &&
>                  "no ScheduleData for bundle member (maybe not in same
>     basic block)");
>           if (BundleMember->IsScheduled) {
>     @@ -3667,7 +3490,7 @@ void BoUpSLP::BlockScheduling::cancelSch
>         if (isa<PHINode>(OpValue))
>           return;
> 
>     -  ScheduleData *Bundle = getScheduleData(OpValue)->FirstInBundle;
>     +  ScheduleData *Bundle = getScheduleData(OpValue);
>         DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
>         assert(!Bundle->IsScheduled &&
>                "Can't cancel bundle which is already scheduled");
>     @@ -3972,7 +3795,7 @@ void BoUpSLP::scheduleBlock(BlockSchedul
>              I = I->getNextNode()) {
>         BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
>             assert(SD->isPartOfBundle() ==
>     -                 (getTreeEntry(SD->Inst, SD->OpValue) != nullptr) &&
>     +                 (getTreeEntry(SD->Inst) != nullptr) &&
>                    "scheduler and vectorizer bundle mismatch");
>             SD->FirstInBundle->SchedulingPriority = Idx++;
>             if (SD->isSchedulingEntity()) {
>     @@ -3995,13 +3818,12 @@ void BoUpSLP::scheduleBlock(BlockSchedul
>           ScheduleData *BundleMember = picked;
>           while (BundleMember) {
>             Instruction *pickedInst = BundleMember->Inst;
>     -      if (pickedInst == BundleMember->OpValue) {
>     -        if (LastScheduledInst->getNextNode() != pickedInst) {
>     -          BS->BB->getInstList().remove(pickedInst);
>     -          BS->BB->getInstList().insert(LastScheduledInst->getIterator(), pickedInst);
>     -        }
>     -        LastScheduledInst = pickedInst;
>     +      if (LastScheduledInst->getNextNode() != pickedInst) {
>     +        BS->BB->getInstList().remove(pickedInst);
>     +        BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
>     +                                     pickedInst);
>             }
>     +      LastScheduledInst = pickedInst;
>             BundleMember = BundleMember->NextInBundle;
>           }
> 
> 
>     Removed: llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
>     URL:
>     http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll?rev=318238&view=auto
>     ==============================================================================
>     --- llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll (original)
>     +++ llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll (removed)
>     @@ -1,52 +0,0 @@
>     -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
>     -; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer -S < %s | FileCheck %s
>     -
>     -@bar = external global [4 x [4 x i32]], align 4
>     -@dct_luma = external global [4 x [4 x i32]], align 4
>     -
>     -define void @foo() local_unnamed_addr {
>     -; CHECK-LABEL: @foo(
>     -; CHECK-NEXT:  entry:
>     -; CHECK-NEXT:    [[ADD277:%.*]] = add nsw i32 undef, undef
>     -; CHECK-NEXT:    store i32 [[ADD277]], i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
>     -; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
>     -; CHECK-NEXT:    [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
>     -; CHECK-NEXT:    [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4
>     -; CHECK-NEXT:    [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
>     -; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4
>     -; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP0]], i32 0
>     -; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[ADD277]], i32 1
>     -; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 2
>     -; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 3
>     -; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> undef, [[TMP6]]
>     -; CHECK-NEXT:    [[TMP8:%.*]] = ashr <4 x i32> [[TMP7]], <i32 6, i32 6, i32 6, i32 6>
>     -; CHECK-NEXT:    [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
>     -; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>*
>     -; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
>     -; CHECK-NEXT:    unreachable
>     -;
>     -entry:
>     -  %add277 = add nsw i32 undef, undef
>     -  store i32 %add277, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
>     -  %0 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
>     -  %sub355 = add nsw i32 undef, %0
>     -  %shr.i = ashr i32 %sub355, 6
>     -  %arrayidx372 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
>     -  store i32 %shr.i, i32* %arrayidx372, align 4
>     -  %sub355.1 = add nsw i32 undef, %add277
>     -  %shr.i.1 = ashr i32 %sub355.1, 6
>     -  %arrayidx372.1 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
>     -  store i32 %shr.i.1, i32* %arrayidx372.1, align 4
>     -  %1 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4
>     -  %sub355.2 = add nsw i32 undef, %1
>     -  %shr.i.2 = ashr i32 %sub355.2, 6
>     -  %arrayidx372.2 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
>     -  store i32 %shr.i.2, i32* %arrayidx372.2, align 4
>     -  %2 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4
>     -  %sub355.3 = add nsw i32 undef, %2
>     -  %shr.i.3 = ashr i32 %sub355.3, 6
>     -  %arrayidx372.3 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
>     -  store i32 %shr.i.3, i32* %arrayidx372.3, align 4
>     -  unreachable
>     -}
> 
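(The file removed above was the regression test for PR34619. To re-run it
against a tree that still has r318193 applied, the test's own RUN line
expands to the following invocation, assuming the old test is saved
locally as pr34619.ll:

  opt -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer -S < pr34619.ll | FileCheck pr34619.ll
)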
>     Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
>     URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll?rev=318239&r1=318238&r2=318239&view=diff
>     ==============================================================================
>     --- llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll (original)
>     +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll Tue Nov 14 16:38:13 2017
>     @@ -43,16 +43,22 @@ define void @add1(i32* noalias %dst, i32
>       ; CHECK-LABEL: @add1(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
>     +; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
>     +; CHECK-NEXT:    store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
>     +; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2
>       ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[TMP1]]
>     -; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
>     -; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
>     +; CHECK-NEXT:    store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
>     +; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
>     +; CHECK-NEXT:    store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
>     @@ -80,16 +86,22 @@ define void @sub0(i32* noalias %dst, i32
>       ; CHECK-LABEL: @sub0(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
>     +; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
>     +; CHECK-NEXT:    store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
>     +; CHECK-NEXT:    [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
>       ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 -1, i32 0, i32 -2, i32 -3>, [[TMP1]]
>     -; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
>     -; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
>     +; CHECK-NEXT:    store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3
>     +; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
>     @@ -193,18 +205,22 @@ define void @addsub0(i32* noalias %dst,
>       ; CHECK-LABEL: @addsub0(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
>     +; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
>     +; CHECK-NEXT:    store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
>     +; CHECK-NEXT:    [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
>       ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32 0, i32 -2, i32 -3>
>     -; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], <i32 -1, i32 0, i32 -2, i32 -3>
>     -; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
>     -; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
>     -; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
>     +; CHECK-NEXT:    store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
>     +; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
>     @@ -232,18 +248,22 @@ define void @addsub1(i32* noalias %dst,
>       ; CHECK-LABEL: @addsub1(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
>     +; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
>     +; CHECK-NEXT:    [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
>       ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
>     +; CHECK-NEXT:    store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 0, i32 -3>
>     -; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 0, i32 -3>
>     -; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
>     -; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
>     -; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
>     +; CHECK-NEXT:    store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
>     +; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
>     @@ -271,16 +291,22 @@ define void @mul(i32* noalias %dst, i32*
>       ; CHECK-LABEL: @mul(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
>     +; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store i32 [[MUL]], i32* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
>     +; CHECK-NEXT:    [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
>     +; CHECK-NEXT:    store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw <4 x i32> <i32 257, i32 -3, i32 1, i32 -9>, [[TMP1]]
>     -; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
>     -; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
>     +; CHECK-NEXT:    store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
>     +; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
>     +; CHECK-NEXT:    store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
>     @@ -308,16 +334,22 @@ define void @shl0(i32* noalias %dst, i32
>       ; CHECK-LABEL: @shl0(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
>     +; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP1]], 1
>       ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
>     +; CHECK-NEXT:    store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
>     +; CHECK-NEXT:    [[SHL5:%.*]] = shl i32 [[TMP2]], 2
>       ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 0, i32 1, i32 2, i32 3>
>     -; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
>     -; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
>     +; CHECK-NEXT:    store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[SHL8:%.*]] = shl i32 [[TMP3]], 3
>     +; CHECK-NEXT:    store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
>     @@ -421,16 +453,22 @@ define void @add1f(float* noalias %dst,
>       ; CHECK-LABEL: @add1f(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store float [[TMP0]], float* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
>     +; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
>     +; CHECK-NEXT:    store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
>     +; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
>       ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, [[TMP1]]
>     -; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
>     -; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
>     +; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
>     +; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
>     +; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
>     @@ -458,16 +496,22 @@ define void @sub0f(float* noalias %dst,
>       ; CHECK-LABEL: @sub0f(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
>     +; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store float [[ADD]], float* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
>     +; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
>     +; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
>       ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
>     -; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
>     -; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
>     +; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
>     +; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
>     +; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
>     @@ -571,18 +615,22 @@ define void @addsub0f(float* noalias %ds
>       ; CHECK-LABEL: @addsub0f(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
>     +; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
>     +; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
>     +; CHECK-NEXT:    [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
>       ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
>     -; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
>     -; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
>     -; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
>     -; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
>     +; CHECK-NEXT:    store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
>     +; CHECK-NEXT:    store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
>     @@ -610,18 +658,22 @@ define void @addsub1f(float* noalias %ds
>       ; CHECK-LABEL: @addsub1f(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
>     +; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
>     +; CHECK-NEXT:    [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
>       ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
>     +; CHECK-NEXT:    store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
>     -; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
>     -; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
>     -; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
>     -; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
>     +; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
>     +; CHECK-NEXT:    store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
>     @@ -649,16 +701,22 @@ define void @mulf(float* noalias %dst, f
>       ; CHECK-LABEL: @mulf(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
>     +; CHECK-NEXT:    [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
>     +; CHECK-NEXT:    [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
>     +; CHECK-NEXT:    store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>, [[TMP1]]
>     -; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
>     -; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
>     +; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
>     +; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
>     +; CHECK-NEXT:    store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
>     @@ -767,16 +825,22 @@ define void @sub0fn(float* noalias %dst,
>       ; CHECK-LABEL: @sub0fn(
>       ; CHECK-NEXT:  entry:
>       ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
>     +; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
>     +; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
>       ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
>     +; CHECK-NEXT:    store float [[ADD]], float* [[DST]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
>     +; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
>     +; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
>       ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
>     +; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
>     +; CHECK-NEXT:    [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
>       ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
>     -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
>     -; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
>     -; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
>     -; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
>     -; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
>     +; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
>     +; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
>     +; CHECK-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
>     +; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
>       ; CHECK-NEXT:    ret void
>       ;
>       entry:
> 
> 