[llvm] [SLP] SLP's copyable elements based upon Main/Alt operations. (PR #124242)

Dinar Temirbulatov via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 19 14:10:39 PDT 2025


https://github.com/dtemirbulatov updated https://github.com/llvm/llvm-project/pull/124242

From bde8c3adbbfd296e459601113acd112ce786d5e7 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at gmail.com>
Date: Sun, 23 Feb 2025 02:48:12 +0000
Subject: [PATCH 1/9] [SLP] SLP's copyable elements based upon Main/Alt
 operations.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 830 +++++++++++++++---
 .../X86/vect_copyable_in_binops.ll            | 723 ++++++++++++---
 2 files changed, 1269 insertions(+), 284 deletions(-)
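
A note on the idea for readers of the patch (a minimal sketch, not part of
the change itself): a "copyable" lane is one whose instruction does not match
the main opcode of the bundle, but can still be folded into the vector
operation by pairing its value with the identity constant of the main binary
opcode, so a plain value next to an add behaves as "x + 0". A standalone C++
model of that padding, with all names hypothetical:

  #include <array>
  #include <cstdio>

  int main() {
    std::array<int, 2> B = {10, 20};
    int X = 5;
    // Lane 0 is a real add (B[0] + X); lane 1 has no add, so it is modeled
    // as a copy: B[1] + 0, where 0 is the identity of the add opcode.
    std::array<int, 2> SecondOperand = {X, 0};
    std::array<int, 2> A;
    for (int Lane = 0; Lane < 2; ++Lane) // conceptually one vector add
      A[Lane] = B[Lane] + SecondOperand[Lane];
    std::printf("%d %d\n", A[0], A[1]); // prints "15 20"
    return 0;
  }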

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bf256d82ae17d..5225eb2b2eefa 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -201,6 +201,10 @@ static cl::opt<bool> VectorizeNonPowerOf2(
     "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
     cl::desc("Try to vectorize with non-power-of-2 number of elements."));
 
+static cl::opt<bool>
+    VectorizeCopyable("slp-vectorize-copyable", cl::init(false), cl::Hidden,
+                      cl::desc("Try to vectorize with copyable elements."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -594,6 +598,40 @@ static std::optional<unsigned> getElementIndex(const Value *Inst,
   return Index;
 }
 
+/// Checks if an instruction with the given \p Opcode can be considered as an
+/// operand of the (possibly binary) operation \p I.
+/// \returns The opcode of \p I if an instruction with \p Opcode can act as an
+/// operand of \p I with a default (identity) value, 0 otherwise.
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+  if (Opcode != Instruction::PHI && Opcode != Instruction::Invoke &&
+      (I->getOpcode() == Instruction::Add ||
+       I->getOpcode() == Instruction::And ||
+       I->getOpcode() == Instruction::AShr ||
+       I->getOpcode() == Instruction::BitCast ||
+       I->getOpcode() == Instruction::Call ||
+       // Issue with scheduling with isVectorLikeInstWithConstOps
+       // operations.
+       // I->getOpcode() == Instruction::ExtractElement ||
+       // I->getOpcode() == Instruction::ExtractValue ||
+       I->getOpcode() == Instruction::ICmp ||
+       I->getOpcode() == Instruction::Load ||
+       I->getOpcode() == Instruction::LShr ||
+       I->getOpcode() == Instruction::Mul ||
+       I->getOpcode() == Instruction::Or ||
+       I->getOpcode() == Instruction::PtrToInt ||
+       I->getOpcode() == Instruction::Select ||
+       I->getOpcode() == Instruction::SExt ||
+       I->getOpcode() == Instruction::Shl ||
+       I->getOpcode() == Instruction::Sub ||
+       I->getOpcode() == Instruction::Trunc ||
+       I->getOpcode() == Instruction::Xor ||
+       I->getOpcode() == Instruction::ZExt ||
+       (isa<FPMathOperator>(I) && cast<FPMathOperator>(I)->isFast())))
+    return I->getOpcode();
+  return 0;
+}
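
// Editor's illustration (standalone and hypothetical, not part of the patch):
// every opcode accepted above has a neutral "default" operand value, which is
// what makes a foreign lane representable as `x <op> identity` without
// changing its result.
#include <cstdint>
#include <cstdio>

static int64_t identityFor(char Op) {
  switch (Op) {
  case '+': case '|': case '^': return 0; // x + 0, x | 0, x ^ 0 == x
  case '*': return 1;                     // x * 1 == x
  case '&': return ~int64_t(0);           // x & all-ones == x
  default:  return 0;                     // shifts: x << 0 == x
  }
}

int main() {
  int64_t X = 42;
  std::printf("%d\n", X * identityFor('*') == X); // prints "1"
  return 0;
}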
+
 namespace {
 /// Specifies the way the mask should be analyzed for undefs/poisonous elements
 /// in the shuffle mask.
@@ -816,6 +854,9 @@ class InstructionsState {
   Instruction *MainOp = nullptr;
   Instruction *AltOp = nullptr;
 
+  /// True if the alternative operation is a copy instruction.
+  bool IsAltOpCopy = false;
+
 public:
   Instruction *getMainOp() const {
     assert(valid() && "InstructionsState is invalid.");
@@ -832,9 +873,13 @@ class InstructionsState {
 
   unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
 
+  bool isAltOpCopy() const { return IsAltOpCopy; }
+
   /// Some of the instructions in the list have alternate opcodes.
   bool isAltShuffle() const { return getMainOp() != getAltOp(); }
 
+  void setAltOpCopy(bool Val) { IsAltOpCopy = Val; }
+
   bool isOpcodeOrAlt(Instruction *I) const {
     unsigned CheckedOpcode = I->getOpcode();
     return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
@@ -853,6 +898,16 @@ class InstructionsState {
 
 } // end anonymous namespace
 
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as the main operation of \p S, the key is \p Op.
+/// Otherwise the key is the main operation of \p S.
+static Value *isOneOf(const InstructionsState &S, Value *Op) {
+  auto *I = dyn_cast<Instruction>(Op);
+  if (I && S.isOpcodeOrAlt(I))
+    return Op;
+  return S.getMainOp();
+}
+
 /// \returns true if \p Opcode is allowed as part of the main/alternate
 /// instruction for SLP vectorization.
 ///
@@ -865,6 +920,15 @@ static bool isValidForAlternation(unsigned Opcode) {
   return true;
 }
 
+// Check for inner dependencies: a dependency on a main operation from within
+// the bundle is not supported, only dependencies on alternative operations
+// are; for now, dependencies of alternative operations on other alternatives
+// are rejected as well.
+static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
+                                  const InstructionsState &S);
+
+// Determine whether the vector could be vectorized with copyable elements.
+static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt);
+
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                        const TargetLibraryInfo &TLI);
 
@@ -917,19 +981,51 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     return InstructionsState::invalid();
 
   Instruction *MainOp = cast<Instruction>(*It);
+  Instruction *AltOp = MainOp;
+  unsigned Opcode = MainOp->getOpcode();
+  unsigned AltOpcode = Opcode;
+  for (Value *V : iterator_range(It + 1, VL.end())) {
+    auto *Inst = dyn_cast<Instruction>(V);
+    if (!Inst)
+      continue;
+    unsigned VOpcode = Inst->getOpcode();
+    if (AltOpcode == Opcode && !isa<PHINode>(Inst) && VOpcode != Opcode &&
+        isValidForAlternation(VOpcode)) {
+      AltOpcode = VOpcode;
+      AltOp = Inst;
+      break;
+    }
+  }
   unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
   if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
       (VL.size() == 2 && InstCnt < 2))
     return InstructionsState::invalid();
-
-  bool IsCastOp = isa<CastInst>(MainOp);
   bool IsBinOp = isa<BinaryOperator>(MainOp);
+  bool IsCopyable = false;
+  if (MainOp && AltOp && MainOp != AltOp) {
+    if (!IsBinOp && isa<BinaryOperator>(AltOp) && !isa<PHINode>(MainOp)) {
+      std::swap(MainOp, AltOp);
+      std::swap(AltOpcode, Opcode);
+      IsBinOp = true;
+    }
+    IsCopyable = VectorizeCopyable && isCopyableOp(VL, MainOp, AltOp);
+    if (IsCopyable && isa<CmpInst>(AltOp)) {
+      Type *Ty0 = MainOp->getOperand(0)->getType();
+      Type *Ty1 = AltOp->getOperand(0)->getType();
+      if (Ty0 != Ty1)
+        return InstructionsState::invalid();
+    } else if (!IsCopyable) {
+      MainOp = cast<Instruction>(*It);
+      AltOp = MainOp;
+      Opcode = MainOp->getOpcode();
+      AltOpcode = Opcode;
+      IsBinOp = isa<BinaryOperator>(MainOp);
+    }
+  }
+  bool IsCastOp = isa<CastInst>(MainOp);
   bool IsCmpOp = isa<CmpInst>(MainOp);
   CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                         : CmpInst::BAD_ICMP_PREDICATE;
-  Instruction *AltOp = MainOp;
-  unsigned Opcode = MainOp->getOpcode();
-  unsigned AltOpcode = Opcode;
 
   bool SwappedPredsCompatible = IsCmpOp && [&]() {
     SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
@@ -985,12 +1081,12 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
         AltOp = I;
         continue;
       }
-    } else if (IsCastOp && isa<CastInst>(I)) {
+    } else if ((IsCastOp || IsCopyable) && isa<CastInst>(I)) {
       Value *Op0 = MainOp->getOperand(0);
       Type *Ty0 = Op0->getType();
       Value *Op1 = I->getOperand(0);
       Type *Ty1 = Op1->getType();
-      if (Ty0 == Ty1) {
+      if (Ty0 == Ty1 || IsCopyable) {
         if (InstOpcode == Opcode || InstOpcode == AltOpcode)
           continue;
         if (Opcode == AltOpcode) {
@@ -1002,13 +1098,15 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
           continue;
         }
       }
-    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
+    } else if (auto *Inst = dyn_cast<CmpInst>(I);
+               Inst && (IsCmpOp || IsCopyable)) {
       auto *BaseInst = cast<CmpInst>(MainOp);
       Type *Ty0 = BaseInst->getOperand(0)->getType();
       Type *Ty1 = Inst->getOperand(0)->getType();
       if (Ty0 == Ty1) {
-        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
-        assert(InstOpcode == AltOpcode &&
+        assert((IsCopyable || InstOpcode == Opcode) &&
+               "Expected same CmpInst opcode.");
+        assert((IsCopyable || InstOpcode == AltOpcode) &&
                "Alternate instructions are only supported by BinaryOperator "
                "and CastInst.");
         // Check for compatible operands. If the corresponding operands are not
@@ -1039,23 +1137,32 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
             AltPred == CurrentPred || AltPred == SwappedCurrentPred)
           continue;
       }
-    } else if (InstOpcode == Opcode) {
-      assert(InstOpcode == AltOpcode &&
+    } else if (InstOpcode == Opcode ||
+               (IsCopyable && InstOpcode == AltOpcode)) {
+      assert((IsCopyable || InstOpcode == AltOpcode) &&
              "Alternate instructions are only supported by BinaryOperator and "
              "CastInst.");
+      Instruction *Op = MainOp;
+      if (IsCopyable) {
+        if (InstOpcode != Opcode && InstOpcode != AltOpcode) {
+          Op = I;
+        } else if (Opcode != AltOpcode && InstOpcode == AltOpcode) {
+          Op = AltOp;
+        }
+      }
       if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
         if (Gep->getNumOperands() != 2 ||
-            Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
+            Gep->getOperand(0)->getType() != Op->getOperand(0)->getType())
           return InstructionsState::invalid();
       } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
         if (!isVectorLikeInstWithConstOps(EI))
           return InstructionsState::invalid();
       } else if (auto *LI = dyn_cast<LoadInst>(I)) {
-        auto *BaseLI = cast<LoadInst>(MainOp);
+        auto *BaseLI = cast<LoadInst>(Op);
         if (!LI->isSimple() || !BaseLI->isSimple())
           return InstructionsState::invalid();
       } else if (auto *Call = dyn_cast<CallInst>(I)) {
-        auto *CallBase = cast<CallInst>(MainOp);
+        auto *CallBase = cast<CallInst>(Op);
         if (Call->getCalledFunction() != CallBase->getCalledFunction())
           return InstructionsState::invalid();
         if (Call->hasOperandBundles() &&
@@ -1070,13 +1177,14 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
           return InstructionsState::invalid();
         if (!ID) {
           SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
-          if (Mappings.size() != BaseMappings.size() ||
-              Mappings.front().ISA != BaseMappings.front().ISA ||
-              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
-              Mappings.front().VectorName != BaseMappings.front().VectorName ||
-              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
-              Mappings.front().Shape.Parameters !=
-                  BaseMappings.front().Shape.Parameters)
+          if (!Mappings.empty() &&
+              (Mappings.size() != BaseMappings.size() ||
+               Mappings.front().ISA != BaseMappings.front().ISA ||
+               Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
+               Mappings.front().VectorName != BaseMappings.front().VectorName ||
+               Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
+               Mappings.front().Shape.Parameters !=
+                   BaseMappings.front().Shape.Parameters))
             return InstructionsState::invalid();
         }
       }
@@ -1125,6 +1233,54 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
   }
 }
 
+static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
+                                  const InstructionsState &S) {
+  SmallSet<Value *, 4> Ops;
+  SmallSet<Value *, 4> AltOps;
+  unsigned Opcode = S.getOpcode();
+  for (Value *V : VL) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      continue;
+    if (I->getOpcode() == Opcode)
+      Ops.insert(V);
+    else
+      AltOps.insert(V);
+  }
+  for (Value *V : VL) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      continue;
+    for (Use &U : I->operands())
+      if (auto *Op = dyn_cast<Instruction>(U.get()); Op && Ops.contains(Op))
+        return false;
+    if (I->getOpcode() != Opcode) {
+      for (Use &U : I->operands())
+        if (auto *Op = dyn_cast<Instruction>(U.get());
+            Op && AltOps.contains(Op))
+          return false;
+    }
+  }
+  return true;
+}
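
// Editor's illustration (standalone model with assumed semantics, not part
// of the patch): the check above rejects a bundle when any member consumes
// another member that has the main opcode, or when an alternative member
// consumes another alternative member.
#include <cstdio>
#include <set>
#include <vector>

struct ToyInst {
  int Opcode;
  std::vector<const ToyInst *> Operands;
};

static bool innerDepsOK(const std::vector<const ToyInst *> &VL, int MainOpc) {
  std::set<const ToyInst *> Main, Alt;
  for (const ToyInst *I : VL)
    (I->Opcode == MainOpc ? Main : Alt).insert(I);
  for (const ToyInst *I : VL) {
    for (const ToyInst *Op : I->Operands)
      if (Main.count(Op))
        return false; // member depends on a main operation in the bundle
    if (I->Opcode != MainOpc)
      for (const ToyInst *Op : I->Operands)
        if (Alt.count(Op))
          return false; // alternative depends on another alternative
  }
  return true;
}

int main() {
  ToyInst A{1, {}};
  ToyInst B{2, {&A}}; // B (alternative) uses A (main) -> bundle rejected
  std::printf("%d\n", innerDepsOK({&A, &B}, 1)); // prints "0"
  return 0;
}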
+
+static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
+  if (any_of(VL, IsaPred<PoisonValue>) || Main == Alt ||
+      !isa<BinaryOperator>(Main) || !isa<Instruction>(Alt) ||
+      any_of(VL, IsaPred<PHINode>))
+    return false;
+
+  Instruction *MainOp = cast<Instruction>(Main);
+  Instruction *AltOp = cast<Instruction>(Alt);
+
+  return isa<BinaryOperator>(MainOp) && !isa<BinaryOperator>(AltOp) &&
+         isValidForAlternation(MainOp->getOpcode()) &&
+         isValidForAlternation(AltOp->getOpcode()) &&
+         tryToRepresentAsInstArg(MainOp->getOpcode(), AltOp) &&
+         tryToRepresentAsInstArg(AltOp->getOpcode(), MainOp);
+}
+
 /// \returns the AA location that is being access by the instruction.
 static MemoryLocation getLocation(Instruction *I) {
   if (StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -1473,6 +1629,7 @@ class BoUpSLP {
     ScalarToTreeEntries.clear();
     MustGather.clear();
     NonScheduledFirst.clear();
+    CopyableAltOp.clear();
     EntryToLastInstruction.clear();
     LoadEntriesToVectorize.clear();
     IsGraphTransformMode = false;
@@ -2488,8 +2645,16 @@ class BoUpSLP {
           }
           bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
-          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
-                                 APO, false};
+          Instruction *Inst = cast<Instruction>(VL[Lane]);
+          if (Inst->getOpcode() != MainOp->getOpcode() &&
+              OpIdx >= Inst->getNumOperands()) {
+            OpsVec[OpIdx][Lane] = {
+                PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
+                false};
+          } else {
+            OpsVec[OpIdx][Lane] = {
+                cast<Instruction>(VL[Lane])->getOperand(OpIdx), APO, false};
+          }
         }
       }
     }
@@ -3416,7 +3581,7 @@ class BoUpSLP {
     }
 
     /// Some of the instructions in the list have alternate opcodes.
-    bool isAltShuffle() const { return S.isAltShuffle(); }
+    bool isAltShuffle() const { return S.isAltShuffle() && !S.isAltOpCopy(); }
 
     bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
 
@@ -3444,6 +3609,8 @@ class BoUpSLP {
 
     unsigned getAltOpcode() const { return S.getAltOpcode(); }
 
+    bool isAltOpCopy() const { return S.isAltOpCopy(); }
+
     bool hasState() const { return S.valid(); }
 
     /// When ReuseReorderShuffleIndices is empty it just returns position of \p
@@ -3543,6 +3710,7 @@ class BoUpSLP {
       if (S) {
         dbgs() << "MainOp: " << *S.getMainOp() << "\n";
         dbgs() << "AltOp: " << *S.getAltOp() << "\n";
+        dbgs() << "isAltOpCopy: " << S.isAltOpCopy() << "\n";
       } else {
         dbgs() << "MainOp: NULL\n";
         dbgs() << "AltOp: NULL\n";
@@ -3636,7 +3804,7 @@ class BoUpSLP {
     // for non-power-of-two vectors.
     assert(
         (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
-         ReuseShuffleIndices.empty()) &&
+         S.isAltOpCopy() || ReuseShuffleIndices.empty()) &&
         "Reshuffling scalars not yet supported for nodes with padding");
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
@@ -3660,10 +3828,18 @@ class BoUpSLP {
     }
     if (!Last->isGather()) {
       SmallPtrSet<Value *, 4> Processed;
+      unsigned Opcode = S.getOpcode();
       for (Value *V : VL) {
         if (isa<PoisonValue>(V))
           continue;
         auto It = ScalarToTreeEntries.find(V);
+        auto *I = dyn_cast<Instruction>(V);
+        bool IsAltInst = I && I->getOpcode() != Opcode;
+        if (S.isAltOpCopy() && IsAltInst) {
+          CopyableAltOp.insert(V);
+          continue;
+        }
         assert(
             (It == ScalarToTreeEntries.end() ||
              (It->getSecond().size() == 1 && It->getSecond().front() == Last) ||
@@ -3759,13 +3935,15 @@ class BoUpSLP {
   bool areAltOperandsProfitable(const InstructionsState &S,
                                 ArrayRef<Value *> VL) const;
 
+  /// Check whether the operations can be represented as copyable by looking
+  /// at their operands.
+  bool canRepresentAsCopyable(const InstructionsState &S, ArrayRef<Value *> VL);
+
   /// Checks if the specified list of the instructions/values can be vectorized
   /// and fills required data before actual scheduling of the instructions.
-  TreeEntry::EntryState
-  getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
-                               bool IsScatterVectorizeUserTE,
-                               OrdersType &CurrentOrder,
-                               SmallVectorImpl<Value *> &PointerOps);
+  TreeEntry::EntryState getScalarsVectorizationState(
+      InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps);
 
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -3776,6 +3954,9 @@ class BoUpSLP {
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
+  /// A set of scalars that we are considering as copyable operations.
+  ValueSet CopyableAltOp;
+
   /// A set of first non-schedulable values.
   ValueSet NonScheduledFirst;
 
@@ -3908,15 +4089,16 @@ class BoUpSLP {
 
     ScheduleData() = default;
 
-    void init(int BlockSchedulingRegionID, Instruction *I) {
+    void init(int BlockSchedulingRegionID, Value *OpVal) {
       FirstInBundle = this;
       NextInBundle = nullptr;
       NextLoadStore = nullptr;
       IsScheduled = false;
       SchedulingRegionID = BlockSchedulingRegionID;
       clearDependencies();
-      Inst = I;
+      OpValue = OpVal;
       TE = nullptr;
+      IsCopy = false;
     }
 
     /// Verify basic self consistency properties
@@ -4029,6 +4211,9 @@ class BoUpSLP {
 
     Instruction *Inst = nullptr;
 
+    /// The key value used to look up this ScheduleData in the map.
+    Value *OpValue = nullptr;
+
     /// The TreeEntry that this instruction corresponds to.
     TreeEntry *TE = nullptr;
 
@@ -4076,6 +4261,9 @@ class BoUpSLP {
     /// True if this instruction is scheduled (or considered as scheduled in the
     /// dry-run).
     bool IsScheduled = false;
+
+    /// True if this instruction is a copy.
+    bool IsCopy = false;
   };
 
 #ifndef NDEBUG
@@ -4133,15 +4321,28 @@ class BoUpSLP {
       if (BB != I->getParent())
         // Avoid lookup if can't possibly be in map.
         return nullptr;
-      ScheduleData *SD = ScheduleDataMap.lookup(I);
-      if (SD && isInSchedulingRegion(SD))
-        return SD;
+      return getScheduleData(I, I);
+    }
+
+    ScheduleData *getScheduleData(Value *V) { return getScheduleData(V, V); }
+
+    ScheduleData *getScheduleData(Value *V, Value *Key) {
+      auto I = ScheduleDataMap.find(V);
+      if (I != ScheduleDataMap.end()) {
+        ScheduleData *SD = I->second.lookup(Key);
+        if (SD && isInSchedulingRegion(SD))
+          return SD;
+      }
       return nullptr;
     }
 
-    ScheduleData *getScheduleData(Value *V) {
-      if (auto *I = dyn_cast<Instruction>(V))
-        return getScheduleData(I);
+    ScheduleData *getScheduleData(Value *V, const TreeEntry *E) {
+      auto I = ScheduleDataMap.find(V);
+      if (I == ScheduleDataMap.end())
+        return nullptr;
+      for (auto &P : I->second)
+        if (isInSchedulingRegion(P.second) && P.second->TE == E)
+          return P.second;
       return nullptr;
     }
 
@@ -4158,30 +4359,32 @@ class BoUpSLP {
 
       for (ScheduleData *BundleMember = SD; BundleMember;
            BundleMember = BundleMember->NextInBundle) {
-
         // Handle the def-use chain dependencies.
 
         // Decrement the unscheduled counter and insert to ready list if ready.
-        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
-          ScheduleData *OpDef = getScheduleData(I);
-          if (OpDef && OpDef->hasValidDependencies() &&
-              OpDef->incrementUnscheduledDeps(-1) == 0) {
-            // There are no more unscheduled dependencies after
-            // decrementing, so we can put the dependent instruction
-            // into the ready list.
-            ScheduleData *DepBundle = OpDef->FirstInBundle;
-            assert(!DepBundle->IsScheduled &&
-                   "already scheduled bundle gets ready");
-            ReadyList.insert(DepBundle);
-            LLVM_DEBUG(dbgs()
-                       << "SLP:    gets ready (def): " << *DepBundle << "\n");
-          }
+        auto &&DecrUnsched = [this, &ReadyList, &BundleMember](Instruction *I) {
+          doForAllOpcodes(I, [&ReadyList, &BundleMember,
+                              &I](ScheduleData *OpDef) {
+            if (OpDef && OpDef->hasValidDependencies() &&
+                BundleMember->Inst != I &&
+                OpDef->incrementUnscheduledDeps(-1) == 0) {
+              // There are no more unscheduled dependencies after
+              // decrementing, so we can put the dependent instruction
+              // into the ready list.
+              ScheduleData *DepBundle = OpDef->FirstInBundle;
+              assert(!DepBundle->IsScheduled &&
+                     "already scheduled bundle gets ready");
+              ReadyList.insert(DepBundle);
+              LLVM_DEBUG(dbgs()
+                         << "SLP:    gets ready (def): " << *DepBundle << "\n");
+            }
+          });
         };
 
         // If BundleMember is a vector bundle, its operands may have been
         // reordered during buildTree(). We therefore need to get its operands
         // through the TreeEntry.
-        if (TreeEntry *TE = BundleMember->TE) {
+        if (TreeEntry *TE = BundleMember->TE; TE && !TE->isAltOpCopy()) {
           // Need to search for the lane since the tree entry can be reordered.
           auto *In = BundleMember->Inst;
           int Lane = std::distance(TE->Scalars.begin(),
@@ -4197,6 +4400,7 @@ class BoUpSLP {
           assert(
               In &&
               (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
+               BundleMember->IsCopy ||
                In->getNumOperands() == TE->getNumOperands()) &&
               "Missed TreeEntry operands?");
 
@@ -4257,7 +4461,8 @@ class BoUpSLP {
                "primary schedule data not in window?");
         assert(isInSchedulingRegion(SD->FirstInBundle) &&
                "entire bundle in window!");
-        SD->verify();
+        (void)SD;
+        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
       }
 
       for (auto *SD : ReadyInsts) {
@@ -4267,35 +4472,47 @@ class BoUpSLP {
       }
     }
 
+    void doForAllOpcodes(Value *V,
+                         function_ref<void(ScheduleData *SD)> Action) {
+      auto I = ScheduleDataMap.find(V);
+      if (I != ScheduleDataMap.end())
+        for (auto &P : I->second)
+          if (isInSchedulingRegion(P.second))
+            Action(P.second);
+    }
+
     /// Put all instructions into the ReadyList which are ready for scheduling.
     template <typename ReadyListType>
     void initialFillReadyList(ReadyListType &ReadyList) {
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
-        ScheduleData *SD = getScheduleData(I);
-        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
-            SD->isReady()) {
-          ReadyList.insert(SD);
-          LLVM_DEBUG(dbgs()
-                     << "SLP:    initially in ready list: " << *SD << "\n");
-        }
+        doForAllOpcodes(I, [&](ScheduleData *SD) {
+          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
+              SD->isReady()) {
+            ReadyList.insert(SD);
+            LLVM_DEBUG(dbgs()
+                       << "SLP:    initially in ready list: " << *SD << "\n");
+          }
+        });
       }
     }
 
     /// Build a bundle from the ScheduleData nodes corresponding to the
     /// scalar instruction for each lane.
-    ScheduleData *buildBundle(ArrayRef<Value *> VL);
+    ScheduleData *buildBundle(ArrayRef<Value *> VL, const InstructionsState &S,
+                              bool &ReSchedule);
 
     /// Checks if a bundle of instructions can be scheduled, i.e. has no
     /// cyclic dependencies. This is only a dry-run, no instructions are
     /// actually moved at this stage.
     /// \returns the scheduling bundle. The returned Optional value is not
     /// std::nullopt if \p VL is allowed to be scheduled.
-    std::optional<ScheduleData *>
-    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
-                      const InstructionsState &S);
+    std::optional<ScheduleData *> tryScheduleBundle(ArrayRef<Value *> VL,
+                                                    BoUpSLP *SLP,
+                                                    const InstructionsState &S,
+                                                    bool AnyCopies);
 
     /// Un-bundles a group of instructions.
-    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
+    void cancelScheduling(ArrayRef<Value *> VL, ScheduleData *Bundle);
 
     /// Allocates schedule data chunk.
     ScheduleData *allocateScheduleDataChunks();
@@ -4333,7 +4550,7 @@ class BoUpSLP {
     /// Attaches ScheduleData to Instruction.
     /// Note that the mapping survives during all vectorization iterations, i.e.
     /// ScheduleData structures are recycled.
-    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
+    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>> ScheduleDataMap;
 
     /// The ready-list for scheduling (only used for the dry-run).
     SetVector<ScheduleData *> ReadyInsts;
@@ -6330,6 +6547,8 @@ bool BoUpSLP::canReorderOperands(
         }))
       continue;
     if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
+      if (TE->isAltOpCopy())
+        return false;
       // Add the node to the list of the ordered nodes with the identity
       // order.
       Edges.emplace_back(I, TE);
@@ -6732,8 +6951,11 @@ void BoUpSLP::buildExternalUses(
     // For each lane:
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
-      if (!isa<Instruction>(Scalar))
+      if (!isa<Instruction>(Scalar) ||
+          (Entry->isAltOpCopy() &&
+           cast<Instruction>(Scalar)->getOpcode() != Entry->getOpcode()))
         continue;
+
       // All uses must be replaced already? No need to do it again.
       auto It = ScalarToExtUses.find(Scalar);
       if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
@@ -7687,6 +7909,52 @@ static bool isAlternateInstruction(const Instruction *I,
                                    const Instruction *AltOp,
                                    const TargetLibraryInfo &TLI);
 
+bool BoUpSLP::canRepresentAsCopyable(const InstructionsState &S,
+                                     ArrayRef<Value *> VL) {
+  unsigned Opcode0 = S.getOpcode();
+  unsigned Opcode1 = S.getAltOpcode();
+  DenseMap<unsigned, unsigned> AltOps;
+  SmallVector<unsigned> MainAltOps;
+  unsigned Operand = 0;
+
+  if (!checkCopyableInnerDep(VL, S))
+    return false;
+  if (isCopyableOp(VL, S.getMainOp(), S.getAltOp()))
+    return true;
+  if ((isa<BinaryOperator>(S.getMainOp()) && Opcode0 == Opcode1) ||
+      (!isValidForAlternation(Opcode0) || !isValidForAlternation(Opcode1)) ||
+      !tryToRepresentAsInstArg(S.getOpcode(), S.getAltOp()) ||
+      !tryToRepresentAsInstArg(S.getAltOpcode(), S.getMainOp()))
+    return false;
+  for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+    Instruction *Inst = dyn_cast<Instruction>(VL[I]);
+    if (!Inst)
+      return false;
+    if (Inst->getOpcode() == Opcode0) {
+      for (unsigned Op : seq<unsigned>(0, S.getMainOp()->getNumOperands())) {
+        Instruction *Inst1 = dyn_cast<Instruction>(Inst->getOperand(Op));
+        if (!Inst1)
+          continue;
+        if (Inst1->getOpcode() == Opcode0)
+          return false;
+        if (AltOps.contains(I) || (!AltOps.empty() && Op != Operand))
+          return false;
+        if (Inst1->getOpcode() == Opcode1) {
+          if (AltOps.empty())
+            Operand = Op;
+          AltOps[I] = Op;
+        }
+      }
+    } else if (Inst->getOpcode() == Opcode1) {
+      MainAltOps.push_back(I);
+    }
+  }
+  if (!AltOps.empty() && !MainAltOps.empty())
+    return true;
+
+  return false;
+}
+
 bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                        ArrayRef<Value *> VL) const {
   unsigned Opcode0 = S.getOpcode();
@@ -7697,6 +7965,8 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                            Opcode0, Opcode1, OpcodeMask))
     return true;
   SmallVector<ValueList> Operands;
+  if (S.getMainOp()->getNumOperands() != S.getAltOp()->getNumOperands())
+    return false;
   for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
     Operands.emplace_back();
     // Prepare the operand vector.
@@ -7861,9 +8131,8 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
 }
 
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
-    const InstructionsState &S, ArrayRef<Value *> VL,
-    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
-    SmallVectorImpl<Value *> &PointerOps) {
+    InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) {
   assert(S.getMainOp() &&
          "Expected instructions with same/alternate opcodes only.");
 
@@ -8218,6 +8487,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     return TreeEntry::Vectorize;
   }
   case Instruction::ShuffleVector: {
+    if (VectorizeCopyable && isCopyableOp(VL, S.getMainOp(), S.getAltOp()) &&
+        checkCopyableInnerDep(VL, S)) {
+      S.setAltOpCopy(true);
+      return TreeEntry::Vectorize;
+    }
     if (!S.isAltShuffle()) {
       // REVEC can support non alternate shuffle.
       if (SLPReVec && getShufflevectorNumGroups(VL))
@@ -8234,6 +8508,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
              "the whole alt sequence is not profitable.\n");
       return TreeEntry::NeedToGather;
     }
+    if (VectorizeCopyable && canRepresentAsCopyable(S, VL))
+      S.setAltOpCopy(true);
 
     return TreeEntry::Vectorize;
   }
@@ -8516,6 +8792,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
     auto *I1 = cast<Instruction>(VL.front());
     auto *I2 = cast<Instruction>(VL.back());
+    if (I1->getNumOperands() != I2->getNumOperands())
+      return true;
     for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
       Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                              I2->getOperand(Op));
@@ -8656,7 +8934,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   SmallVector<Value *> PointerOps;
   TreeEntry::EntryState State = getScalarsVectorizationState(
       S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
-  if (State == TreeEntry::NeedToGather) {
+  if (S.isAltOpCopy()) {
+    for (Value *V : VL) {
+      Instruction *I = dyn_cast<Instruction>(V);
+      if (!I)
+        continue;
+      if (I->getOpcode() == S.getAltOpcode() && CopyableAltOp.contains(V)) {
+        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+                     ReuseShuffleIndices);
+        return;
+      }
+    }
+  }
+  if (State == TreeEntry::NeedToGather ||
+      (S.isAltOpCopy() && !has_single_bit(UniqueValues.size()))) {
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndices);
     return;
@@ -8666,18 +8957,25 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   if (!BSRef)
     BSRef = std::make_unique<BlockScheduling>(BB);
 
-  BlockScheduling &BS = *BSRef;
+  bool AnyCopies = any_of(VectorizableTree,
+                          [](const auto &TE) { return TE->isAltOpCopy(); });
 
-  std::optional<ScheduleData *> Bundle =
-      BS.tryScheduleBundle(UniqueValues, this, S);
+  BlockScheduling &BS = *BSRef;
+  std::optional<ScheduleData *> Bundle =
+      BS.tryScheduleBundle(UniqueValues, this, S, AnyCopies);
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
 #endif
-  if (!Bundle) {
+  if (!Bundle || (S.isAltOpCopy() && !Bundle.value())) {
     LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
     assert((!BS.getScheduleData(VL0) ||
-            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
+            !BS.getScheduleData(VL0)->isPartOfBundle() || S.isAltOpCopy() ||
+            (BS.getScheduleData(VL0)->TE &&
+             BS.getScheduleData(VL0)->TE->isAltOpCopy())) &&
            "tryScheduleBundle should cancelScheduling on failure");
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndices);
@@ -9078,7 +9376,73 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
             TE->dump());
       }
-
+      if (S.isAltOpCopy() && !isCopyableOp(VL, S.getMainOp(), S.getAltOp())) {
+        ValueList Left, Right;
+        unsigned Opcode0 = S.getOpcode();
+        unsigned Opcode1 = S.getAltOpcode();
+        unsigned Operand = 0;
+        bool IsOperandSet = false;
+        ValueList NewMainVL;
+        ValueList NewVL;
+        for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+          Instruction *Inst = cast<Instruction>(VL[I]);
+          if (Inst->getOpcode() == Opcode0) {
+            NewMainVL.push_back(VL[I]);
+            unsigned Op = 0;
+            Instruction *Inst1 = dyn_cast<Instruction>(Inst->getOperand(Op));
+            if (!Inst1) {
+              NewVL.push_back(Inst->getOperand(Op));
+              continue;
+            }
+            if (IsOperandSet && Op != Operand)
+              return;
+            if (Inst1->getOpcode() == Opcode1) {
+              if (!IsOperandSet) {
+                Operand = Op;
+                IsOperandSet = true;
+              }
+            }
+            NewVL.push_back(Inst1);
+          } else if (Inst->getOpcode() == Opcode1) {
+            NewVL.push_back(Inst);
+          }
+        }
+        VLOperands Ops(VL, S, *this);
+        Left = Ops.getVL(0);
+        Right = Ops.getVL(1);
+        for (unsigned I = 0, VF = VL.size(); I < VF; ++I)
+          if (cast<Instruction>(VL[I])->getOpcode() != Opcode0)
+            Right[I] = ConstantExpr::getBinOpIdentity(
+                Opcode0, Right[0]->getType(), /*AllowRHSConstant=*/true);
+        TE->setOperand(0, NewVL);
+        TE->setOperand(1, Right);
+        buildTree_rec(NewVL, Depth + 1, {TE, 0});
+        buildTree_rec(Right, Depth + 1, {TE, 1});
+        return;
+      } else if (S.isAltOpCopy()) {
+        ValueList Left, Right;
+        unsigned Opcode0 = S.getOpcode();
+        VLOperands Ops(VL, S, *this);
+        Left = Ops.getVL(0);
+        Right = Ops.getVL(1);
+        ValueList NewLeft, NewRight;
+        for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+          if (cast<Instruction>(VL[I])->getOpcode() != Opcode0) {
+            NewLeft.push_back(VL[I]);
+            NewRight.push_back(ConstantExpr::getBinOpIdentity(
+                Opcode0, S.getMainOp()->getType(), /*AllowRHSConstant=*/true));
+          } else {
+            NewLeft.push_back(Left[I]);
+            NewRight.push_back(Right[I]);
+          }
+        }
+        TE->setOperand(0, NewLeft);
+        TE->setOperand(1, NewRight);
+        buildTree_rec(NewLeft, Depth + 1, {TE, 0});
+        buildTree_rec(NewRight, Depth + 1, {TE, 1});
+        return;
+      }
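
// Editor's note (hedged sketch, assumes LLVM headers; illustrative only):
// both branches above pad the operand vector of copy lanes with the identity
// constant of the main opcode via the existing helper
// ConstantExpr::getBinOpIdentity. For an i32 'add' that constant is 0, so a
// copy lane computes "x + 0" and keeps its scalar value.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

static llvm::Constant *copyLanePadding(llvm::LLVMContext &Ctx) {
  return llvm::ConstantExpr::getBinOpIdentity(
      llvm::Instruction::Add, llvm::Type::getInt32Ty(Ctx),
      /*AllowRHSConstant=*/true);
}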
       // Reorder operands if reordering would enable vectorization.
       auto *CI = dyn_cast<CmpInst>(VL0);
       if (CI && any_of(VL, [](Value *V) {
@@ -11344,8 +11708,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
            E->getMainOp()->getType()->isPointerTy())) &&
          "Invalid VL");
   Instruction *VL0 = E->getMainOp();
-  unsigned ShuffleOrOp =
-      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+  unsigned ShuffleOrOp = (E->isAltShuffle() && !E->isAltOpCopy())
+                             ? (unsigned)Instruction::ShuffleVector
+                             : E->getOpcode();
   if (E->CombinedOp != TreeEntry::NotCombinedOp)
     ShuffleOrOp = E->CombinedOp;
   SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
@@ -11992,7 +12357,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                Instruction::isBinaryOp(E->getAltOpcode())) ||
               (Instruction::isCast(E->getOpcode()) &&
                Instruction::isCast(E->getAltOpcode())) ||
-              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
+              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp())) ||
+              E->isAltOpCopy()) &&
              "Invalid Shuffle Vector Operand");
     // Try to find the previous shuffle node with the same operands and same
     // main/alternate ops.
@@ -12780,6 +13146,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
         std::optional<unsigned> InsertIdx = getElementIndex(VU);
         if (InsertIdx) {
           const TreeEntry *ScalarTE = &EU.E;
+          if (!ScalarTE)
+            continue;
           auto *It = find_if(
               ShuffledInserts,
               [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
@@ -12862,8 +13230,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
                                   EU.Lane, EU.Scalar, ScalarUserAndIdx);
     }
     // Leave the scalar instructions as is if they are cheaper than extracts.
-    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
-        Entry->getOpcode() == Instruction::Load) {
+    if (Entry &&
+        (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
+         Entry->getOpcode() == Instruction::Load)) {
       // Checks if the user of the external scalar is phi in loop body.
       auto IsPhiInLoop = [&](const ExternalUser &U) {
         if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
@@ -14128,13 +14497,20 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
   // VL.back() and iterate over schedule data until we reach the end of the
   // bundle. The end of the bundle is marked by null ScheduleData.
   if (BlocksSchedules.count(BB) && !E->isGather()) {
-    Value *V = E->isOneOf(E->Scalars.back());
+    Value *V = E->getMainOp();
     if (doesNotNeedToBeScheduled(V))
       V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
-    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
-    if (Bundle && Bundle->isPartOfBundle())
+    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V, E);
+    if (Bundle && Bundle->isPartOfBundle()) {
+      if (any_of(E->Scalars, [&](Value *V) {
+            return (!doesNotNeedToBeScheduled(V) && CopyableAltOp.contains(V));
+          }))
+        Bundle = Bundle->FirstInBundle;
       for (; Bundle; Bundle = Bundle->NextInBundle)
-        Res = Bundle->Inst;
+        if (!CopyableAltOp.contains(Bundle->Inst) &&
+            !doesNotNeedToBeScheduled(Bundle->Inst))
+          Res = Bundle->Inst;
+    }
   }
 
   // LastInst can still be null at this point if there's either not an entry
@@ -14876,8 +15252,12 @@ BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
                                      const InstructionsState &S) {
   if (!S)
     return nullptr;
-  if (TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
-      VE && VE->UserTreeIndex.UserTE == E &&
+  TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
+  if (VE && VE->UserTreeIndex.UserTE == E &&
+      VE->UserTreeIndex.EdgeIdx == NodeIdx)
+    return VE;
+  VE = getSameValuesTreeEntry(S.getAltOp(), VL);
+  if (VE && VE->isAltOpCopy() && VE->UserTreeIndex.UserTE == E &&
       VE->UserTreeIndex.EdgeIdx == NodeIdx)
     return VE;
   return nullptr;
@@ -16594,6 +16974,8 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
     if (User && !is_contained(Scalar->users(), User))
       continue;
     const TreeEntry *E = &ExternalUse.E;
+    if (!E && CopyableAltOp.contains(Scalar))
+      continue;
     assert(E && "Invalid scalar");
     assert(!E->isGather() && "Extracting from a gather list");
     // Non-instruction pointers are not deleted, just skip them.
@@ -16985,6 +17367,9 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
         continue;
       if (isa<PoisonValue>(Scalar))
         continue;
+      if (Entry->isAltOpCopy() &&
+          cast<Instruction>(Scalar)->getOpcode() != Entry->getOpcode())
+        continue;
 #ifndef NDEBUG
       Type *Ty = Scalar->getType();
       if (!Ty->isVoidTy()) {
@@ -17221,14 +17606,59 @@ void BoUpSLP::optimizeGatherSequence() {
   GatherShuffleExtractSeq.clear();
 }
 
-BoUpSLP::ScheduleData *
-BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
+BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::buildBundle(
+    ArrayRef<Value *> VL, const InstructionsState &S, bool &ReSchedule) {
   ScheduleData *Bundle = nullptr;
   ScheduleData *PrevInBundle = nullptr;
+  unsigned Opcode = S.getOpcode();
+  ValueList Keys;
+
   for (Value *V : VL) {
+    auto *SD = getScheduleData(V);
+    bool FoundKey = false;
+    if (SD && !SD->isPartOfBundle()) {
+      Keys.push_back(V);
+      continue;
+    }
+    for (Value *Key : VL) {
+      SD = getScheduleData(V, Key);
+      if (SD && SD->isPartOfBundle()) {
+        ReSchedule = true;
+      } else {
+        FoundKey = true;
+        Keys.push_back(Key);
+        break;
+      }
+    }
+    if (!FoundKey) {
+      for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E;
+           ++It) {
+        Value *Key = &*It;
+        SD = getScheduleData(V, Key);
+        if (!SD || !SD->isPartOfBundle()) {
+          FoundKey = true;
+          Keys.push_back(Key);
+          break;
+        }
+      }
+    }
+  }
+  for (auto [V, Key] : zip(VL, Keys)) {
     if (doesNotNeedToBeScheduled(V))
       continue;
-    ScheduleData *BundleMember = getScheduleData(V);
+    auto *I = dyn_cast<Instruction>(V);
+    bool IsAltInst = I && I->getOpcode() != Opcode;
+    ScheduleData *BundleMember = getScheduleData(V, Key);
+    if (V != Key) {
+      ScheduleData *SD = allocateScheduleDataChunks();
+      SD->Inst = I;
+      SD->init(SchedulingRegionID, Key);
+      ScheduleDataMap[I][Key] = SD;
+      BundleMember = getScheduleData(V, Key);
+    }
+    }
     assert(BundleMember &&
            "no ScheduleData for bundle member "
            "(maybe not in same basic block)");
@@ -17242,6 +17672,8 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
 
     // Group the instructions to a bundle.
     BundleMember->FirstInBundle = Bundle;
+    if (S.isAltOpCopy() && IsAltInst)
+      BundleMember->IsCopy = true;
     PrevInBundle = BundleMember;
   }
   assert(Bundle && "Failed to find schedule bundle");
@@ -17252,7 +17684,8 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
 // and schedules instructions until the bundle gets ready.
 std::optional<BoUpSLP::ScheduleData *>
 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
-                                            const InstructionsState &S) {
+                                            const InstructionsState &S,
+                                            bool AnyCopies) {
   // No need to schedule PHIs, insertelement, extractelement and extractvalue
   // instructions.
   if (isa<PHINode>(S.getMainOp()) ||
@@ -17261,19 +17694,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
 
   // Initialize the instruction bundle.
   Instruction *OldScheduleEnd = ScheduleEnd;
+  bool IsAltOpCopy = S.isAltOpCopy();
   LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");
 
-  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
-                                                         ScheduleData *Bundle) {
+  auto TryScheduleBundleImpl = [this, OldScheduleEnd, IsAltOpCopy, AnyCopies,
+                                SLP](bool ReSchedule, ScheduleData *Bundle) {
     // The scheduling region got new instructions at the lower end (or it is a
     // new region for the first bundle). This makes it necessary to
     // recalculate all dependencies.
     // It is seldom that this needs to be done a second time after adding the
     // initial bundle to the region.
-    if (ScheduleEnd != OldScheduleEnd) {
+    if (ScheduleEnd != OldScheduleEnd || IsAltOpCopy || AnyCopies) {
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
-        if (ScheduleData *SD = getScheduleData(I))
-          SD->clearDependencies();
+        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
       ReSchedule = true;
     }
     if (Bundle) {
@@ -17339,24 +17772,34 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     ReSchedule = true;
   }
 
-  auto *Bundle = buildBundle(VL);
+  auto *Bundle = buildBundle(VL, S, ReSchedule);
+  if (!Bundle)
+    return std::nullopt;
   TryScheduleBundleImpl(ReSchedule, Bundle);
   if (!Bundle->isReady()) {
-    cancelScheduling(VL, S.getMainOp());
+    cancelScheduling(VL, Bundle);
+    // If there is any copyable element, we have to clear all dependencies,
+    // since they were calculated for the vectorized bundles with copies.
+    if (AnyCopies || IsAltOpCopy) {
+      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
+        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
+      resetSchedule();
+    }
     return std::nullopt;
   }
   return Bundle;
 }
 
 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
-                                                Value *OpValue) {
-  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
-      doesNotNeedToSchedule(VL))
+                                                ScheduleData *Bundle) {
+  if (isa<PHINode>(VL.front()) || isVectorLikeInstWithConstOps(VL.front()) ||
+      doesNotNeedToSchedule(VL) || !Bundle)
     return;
 
-  if (doesNotNeedToBeScheduled(OpValue))
-    OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
-  ScheduleData *Bundle = getScheduleData(OpValue);
+  if (Bundle->FirstInBundle)
+    Bundle = Bundle->FirstInBundle;
+
   LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
   assert(!Bundle->IsScheduled &&
          "Can't cancel bundle which is already scheduled");
@@ -17376,9 +17819,17 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
     ScheduleData *Next = BundleMember->NextInBundle;
     BundleMember->NextInBundle = nullptr;
     BundleMember->TE = nullptr;
+    BundleMember->IsCopy = false;
     if (BundleMember->unscheduledDepsInBundle() == 0) {
       ReadyInsts.insert(BundleMember);
     }
+    auto MapIt = ScheduleDataMap.find(BundleMember->Inst);
+    if (MapIt != ScheduleDataMap.end()) {
+      // Collect the keys first: erasing while iterating would invalidate
+      // the inner map's iterators.
+      SmallVector<Value *, 4> Keys;
+      for (const auto &P : MapIt->second)
+        if (P.second == BundleMember && P.first != BundleMember->Inst)
+          Keys.push_back(P.first);
+      for (Value *Key : Keys)
+        MapIt->second.erase(Key);
+    }
     BundleMember = Next;
   }
 }
@@ -17394,19 +17845,34 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
 
 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
     Value *V, const InstructionsState &S) {
+  if (getScheduleData(V, S.getMainOp()))
+    return true;
   Instruction *I = dyn_cast<Instruction>(V);
   assert(I && "bundle member must be an instruction");
   assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
          !doesNotNeedToBeScheduled(I) &&
          "phi nodes/insertelements/extractelements/extractvalues don't need to "
          "be scheduled");
-  if (getScheduleData(I))
+  auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
+    ScheduleData *ISD = getScheduleData(I);
+    if (!ISD)
+      return false;
+    assert(isInSchedulingRegion(ISD) &&
+           "ScheduleData not in scheduling region");
+    ScheduleData *SD = allocateScheduleDataChunks();
+    SD->Inst = I;
+    SD->init(SchedulingRegionID, S.getMainOp());
+    ScheduleDataMap[I][S.getMainOp()] = SD;
+    return true;
+  };
+  if (CheckScheduleForI(I))
     return true;
   if (!ScheduleStart) {
     // It's the first instruction in the new region.
     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
     ScheduleStart = I;
     ScheduleEnd = I->getNextNode();
+    if (isOneOf(S, I) != I)
+      CheckScheduleForI(I);
     assert(ScheduleEnd && "tried to vectorize a terminator?");
     LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
     return true;
@@ -17445,6 +17911,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
            "Instruction is in wrong basic block.");
     initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
     ScheduleStart = I;
+    if (isOneOf(S, I) != I)
+      CheckScheduleForI(I);
     LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                       << "\n");
     return true;
@@ -17457,6 +17925,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
   initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                    nullptr);
   ScheduleEnd = I->getNextNode();
+  if (isOneOf(S, I) != I)
+    CheckScheduleForI(I);
   assert(ScheduleEnd && "tried to vectorize a terminator?");
   LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
   return true;
@@ -17471,10 +17941,14 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
     // No need to allocate data for non-schedulable instructions.
     if (doesNotNeedToBeScheduled(I))
       continue;
-    ScheduleData *SD = ScheduleDataMap.lookup(I);
+    ScheduleData *SD = nullptr;
+    auto It = ScheduleDataMap.find(I);
+    if (It != ScheduleDataMap.end())
+      SD = It->second.lookup(I);
     if (!SD) {
       SD = allocateScheduleDataChunks();
-      ScheduleDataMap[I] = SD;
+      ScheduleDataMap[I][I] = SD;
+      SD->Inst = I;
     }
     assert(!isInSchedulingRegion(SD) &&
            "new ScheduleData already in scheduling region");
@@ -17516,11 +17990,20 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
 
   while (!WorkList.empty()) {
     ScheduleData *SD = WorkList.pop_back_val();
+    bool ResetDeps = false;
+    for (ScheduleData *BundleMember = SD; BundleMember;
+         BundleMember = BundleMember->NextInBundle)
+      if (!BundleMember->hasValidDependencies())
+        ResetDeps = true;
+
     for (ScheduleData *BundleMember = SD; BundleMember;
          BundleMember = BundleMember->NextInBundle) {
       assert(isInSchedulingRegion(BundleMember));
-      if (BundleMember->hasValidDependencies())
+      if (BundleMember->hasValidDependencies()) {
+        if (ResetDeps)
+          BundleMember->resetUnscheduledDeps();
         continue;
+      }
 
       LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
                  << "\n");
@@ -17529,26 +18012,32 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
 
       // Handle def-use chain dependencies.
       for (User *U : BundleMember->Inst->users()) {
-        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
+        if (auto *I = dyn_cast<Instruction>(U)) {
+          doForAllOpcodes(I, [&](ScheduleData *UseSD) {
+            ScheduleData *DestBundle = UseSD->FirstInBundle;
+            if ((UseSD->IsCopy || BundleMember->IsCopy) &&
+                DestBundle == BundleMember->FirstInBundle)
+              return;
+            BundleMember->Dependencies++;
+            if (!DestBundle->IsScheduled)
+              BundleMember->incrementUnscheduledDeps(1);
+            if (!DestBundle->hasValidDependencies())
+              WorkList.push_back(DestBundle);
+          });
+        }
+      }
+
+      auto MakeControlDependent = [&](Instruction *I) {
+        doForAllOpcodes(I, [&](ScheduleData *DepDest) {
+          assert(DepDest && "must be in schedule window");
+          DepDest->ControlDependencies.push_back(BundleMember);
           BundleMember->Dependencies++;
-          ScheduleData *DestBundle = UseSD->FirstInBundle;
+          ScheduleData *DestBundle = DepDest->FirstInBundle;
           if (!DestBundle->IsScheduled)
             BundleMember->incrementUnscheduledDeps(1);
           if (!DestBundle->hasValidDependencies())
             WorkList.push_back(DestBundle);
-        }
-      }
-
-      auto MakeControlDependent = [&](Instruction *I) {
-        auto *DepDest = getScheduleData(I);
-        assert(DepDest && "must be in schedule window");
-        DepDest->ControlDependencies.push_back(BundleMember);
-        BundleMember->Dependencies++;
-        ScheduleData *DestBundle = DepDest->FirstInBundle;
-        if (!DestBundle->IsScheduled)
-          BundleMember->incrementUnscheduledDeps(1);
-        if (!DestBundle->hasValidDependencies())
-          WorkList.push_back(DestBundle);
+        });
       };
 
       // Any instruction which isn't safe to speculate at the beginning of the
@@ -17684,12 +18173,12 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
   assert(ScheduleStart &&
          "tried to reset schedule on block which has not been scheduled");
   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
-    if (ScheduleData *SD = getScheduleData(I)) {
+    doForAllOpcodes(I, [&](ScheduleData *SD) {
       assert(isInSchedulingRegion(SD) &&
              "ScheduleData not in scheduling region");
       SD->IsScheduled = false;
       SD->resetUnscheduledDeps();
-    }
+    });
   }
   ReadyInsts.clear();
 }
@@ -17718,44 +18207,99 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     }
   };
   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
+  SmallDenseMap<Value *, ScheduleData *> CopyElementsMap;
 
   // Ensure that all dependency data is updated (for nodes in the sub-graph)
   // and fill the ready-list with initial instructions.
   int Idx = 0;
   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
        I = I->getNextNode()) {
-    if (ScheduleData *SD = BS->getScheduleData(I)) {
+    BS->doForAllOpcodes(I, [this, &Idx, &CopyElementsMap,
+                            BS](ScheduleData *SD) {
       [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(SD->Inst);
-      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
+      assert((isVectorLikeInstWithConstOps(SD->Inst) || SD->IsCopy ||
               SD->isPartOfBundle() ==
                   (!SDTEs.empty() &&
                    !doesNotNeedToSchedule(SDTEs.front()->Scalars))) &&
              "scheduler and vectorizer bundle mismatch");
       SD->FirstInBundle->SchedulingPriority = Idx++;
+      for (TreeEntry *SDTE : SDTEs)
+        if (SDTE && SDTE->isAltOpCopy()) {
+          ScheduleData *Bundle = SD->FirstInBundle;
+          for (ScheduleData *BundleMember = Bundle; BundleMember;
+               BundleMember = BundleMember->NextInBundle) {
+            if (BundleMember->IsCopy)
+              CopyElementsMap[BundleMember->Inst] = Bundle;
+          }
+        }
 
       if (SD->isSchedulingEntity() && SD->isPartOfBundle())
         BS->calculateDependencies(SD, false, this);
-    }
+    });
   }
   BS->initialFillReadyList(ReadyInsts);
 
   Instruction *LastScheduledInst = BS->ScheduleEnd;
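+  // For bundles that share a copyable instruction: maps a copy-owning bundle
+  // to the related bundle that must be re-emitted right after it.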
+  DenseMap<ScheduleData *, ScheduleData *> ReschedMap;
+
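+  // Collect the bundle's instructions in emission order, keeping copyable
+  // elements apart from the main-opcode instructions.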
+  auto ReorderBundle = [this](ScheduleData *SD) {
+    SmallVector<Instruction *, 2> Insts;
+    TreeEntry *SDTE = SD->TE;
+    if (SDTE && SDTE->isAltOpCopy()) {
+      unsigned Opcode = SDTE->getOpcode();
+      for (ScheduleData *BundleMember = SD; BundleMember;
+           BundleMember = BundleMember->NextInBundle) {
+        if (BundleMember->Inst->getOpcode() != Opcode) {
+          Insts.push_back(BundleMember->Inst);
+        } else {
+          Insts.insert(Insts.begin(), BundleMember->Inst);
+        }
+      }
+    } else {
+      for (ScheduleData *BundleMember = SD; BundleMember;
+           BundleMember = BundleMember->NextInBundle) {
+        if (CopyableAltOp.contains(BundleMember->Inst))
+          Insts.insert(Insts.begin(), BundleMember->Inst);
+        else
+          Insts.push_back(BundleMember->Inst);
+      }
+    }
+    return Insts;
+  };
 
   // Do the "real" scheduling.
   while (!ReadyInsts.empty()) {
     ScheduleData *Picked = *ReadyInsts.begin();
     ReadyInsts.erase(ReadyInsts.begin());
 
-    // Move the scheduled instruction(s) to their dedicated places, if not
-    // there yet.
+    // Record bundles that contain this bundle's instructions as copies, so the
+    // copyable elements are emitted after the main operations.
     for (ScheduleData *BundleMember = Picked; BundleMember;
          BundleMember = BundleMember->NextInBundle) {
-      Instruction *PickedInst = BundleMember->Inst;
-      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
+      if (CopyableAltOp.contains(BundleMember->Inst)) {
+        ScheduleData *SD = CopyElementsMap[BundleMember->Inst];
+        if (SD && SD->FirstInBundle != Picked)
+          ReschedMap[SD] = Picked;
+      }
+    }
+
+    // Move the scheduled instruction(s) to their dedicated places, if not
+    // there yet.
+    for (Instruction *PickedInst : ReorderBundle(Picked)) {
+      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst &&
+          LastScheduledInst->getPrevNode())
         PickedInst->moveAfter(LastScheduledInst->getPrevNode());
       LastScheduledInst = PickedInst;
     }
-
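+    // If the picked bundle shares a copyable instruction with a previously
+    // picked bundle, re-emit that bundle's instructions right after it.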
+    if (ReschedMap.contains(Picked)) {
+      ScheduleData *Resched = ReschedMap[Picked];
+      for (Instruction *PickedInst : ReorderBundle(Resched)) {
+        if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst &&
+            LastScheduledInst->getPrevNode())
+          PickedInst->moveAfter(LastScheduledInst->getPrevNode());
+        LastScheduledInst = PickedInst;
+      }
+    }
     BS->schedule(Picked, ReadyInsts);
   }
 
@@ -17767,9 +18311,10 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
 #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
   // Check that all schedulable entities got scheduled
   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
-    ScheduleData *SD = BS->getScheduleData(I);
-    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
-      assert(SD->IsScheduled && "must be scheduled at this point");
+    BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
+      if (SD->isSchedulingEntity() && SD->hasValidDependencies())
+        assert(SD->IsScheduled && "must be scheduled at this point");
+    });
   }
 #endif
 
@@ -17880,6 +18425,9 @@ bool BoUpSLP::collectValuesToDemote(
   if (NodesToKeepBWs.contains(E.Idx))
     return false;
 
+  if (E.isAltOpCopy())
+    return false;
+
   // If the value is not a vectorized instruction in the expression and not used
   // by the insertelement instruction and not used in multiple vector nodes, it
   // cannot be demoted.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 869a9d1aee80e..7fa746dc758a9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
 ; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -slp-vectorize-copyable=true -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,COPYABLE %s
 
 define void @add0(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @add0(
@@ -60,6 +61,13 @@ define void @add1(ptr noalias %dst, ptr noalias %src) {
 ; POW2-ONLY-NEXT:    store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
+; COPYABLE-LABEL: @add1(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
+; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    ret void
+;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
   %0 = load i32, ptr %src, align 4
@@ -82,21 +90,44 @@ entry:
 }
 
 define void @sub0(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @sub0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @sub0(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; NON-POW2-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT:    store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @sub0(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT:    store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @sub0(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 -1, i32 0, i32 -2, i32 -3>
+; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -180,23 +211,55 @@ entry:
 }
 
 define void @addsub0(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @addsub0(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; NON-POW2-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @addsub0(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @addsub0(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 0>
+; COPYABLE-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; COPYABLE-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; COPYABLE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -220,23 +283,55 @@ entry:
 }
 
 define void @addsub1(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
-; CHECK-NEXT:    store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @addsub1(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; NON-POW2-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; NON-POW2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; NON-POW2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; NON-POW2-NEXT:    store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
+; NON-POW2-NEXT:    store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @addsub1(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
+; POW2-ONLY-NEXT:    store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @addsub1(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; COPYABLE-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; COPYABLE-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT:    [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP4]], <i32 0, i32 -3>
+; COPYABLE-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -260,21 +355,44 @@ entry:
 }
 
 define void @mul(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @mul(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
-; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
-; CHECK-NEXT:    store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @mul(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
+; NON-POW2-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; NON-POW2-NEXT:    store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; NON-POW2-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; NON-POW2-NEXT:    store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @mul(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
+; POW2-ONLY-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; POW2-ONLY-NEXT:    store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @mul(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[TMP0]], <i32 257, i32 -3, i32 1, i32 -9>
+; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -325,6 +443,13 @@ define void @shl0(ptr noalias %dst, ptr noalias %src) {
 ; POW2-ONLY-NEXT:    store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
+; COPYABLE-LABEL: @shl0(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
+; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    ret void
+;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
   %0 = load i32, ptr %src, align 4
@@ -434,6 +559,13 @@ define void @add1f(ptr noalias %dst, ptr noalias %src) {
 ; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
+; COPYABLE-LABEL: @add1f(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; COPYABLE-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    ret void
+;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
   %0 = load float, ptr %src, align 4
@@ -456,21 +588,44 @@ entry:
 }
 
 define void @sub0f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @sub0f(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @sub0f(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT:    store float [[ADD]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; NON-POW2-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; NON-POW2-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @sub0f(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT:    store float [[ADD]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; POW2-ONLY-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @sub0f(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; COPYABLE-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -554,23 +709,55 @@ entry:
 }
 
 define void @addsub0f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub0f(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @addsub0f(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT:    store float [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; NON-POW2-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; NON-POW2-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; NON-POW2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @addsub0f(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT:    store float [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @addsub0f(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00>
+; COPYABLE-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; COPYABLE-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; COPYABLE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -594,23 +781,55 @@ entry:
 }
 
 define void @addsub1f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub1f(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
-; CHECK-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @addsub1f(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; NON-POW2-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; NON-POW2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; NON-POW2-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; NON-POW2-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
+; NON-POW2-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @addsub1f(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
+; POW2-ONLY-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @addsub1f(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; COPYABLE-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; COPYABLE-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP4]], <float 0.000000e+00, float -3.000000e+00>
+; COPYABLE-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -634,21 +853,44 @@ entry:
 }
 
 define void @mulf(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @mulf(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @mulf(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; NON-POW2-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; NON-POW2-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; NON-POW2-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; NON-POW2-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @mulf(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; POW2-ONLY-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; POW2-ONLY-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @mulf(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = fmul fast <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
+; COPYABLE-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -729,6 +971,22 @@ define void @add1fn(ptr noalias %dst, ptr noalias %src) {
 ; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
+; COPYABLE-LABEL: @add1fn(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; COPYABLE-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; COPYABLE-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; COPYABLE-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; COPYABLE-NEXT:    ret void
+;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
   %0 = load float, ptr %src, align 4
@@ -849,21 +1107,49 @@ entry:
 }
 
 define void @mulfn(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @mulfn(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @mulfn(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; NON-POW2-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; NON-POW2-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; NON-POW2-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; NON-POW2-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @mulfn(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; POW2-ONLY-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; POW2-ONLY-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @mulfn(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; COPYABLE-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], <float 1.000000e+00, float -9.000000e+00>
+; COPYABLE-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -885,3 +1171,154 @@ entry:
   store float %sub9, ptr %incdec.ptr7, align 4
   ret void
 }
+
+define void @and_lshr(ptr %0, ptr %1, float %2, float %3) {
+; NON-POW2-LABEL: @and_lshr(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; NON-POW2-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; NON-POW2-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP5]], 2
+; NON-POW2-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP5]], 4
+; NON-POW2-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; NON-POW2-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP6]], i32 1
+; NON-POW2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP7]], i32 2
+; NON-POW2-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+; NON-POW2-NEXT:    [[TMP12:%.*]] = and <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; NON-POW2-NEXT:    [[TMP13:%.*]] = lshr <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; NON-POW2-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; NON-POW2-NEXT:    [[TMP15:%.*]] = sitofp <4 x i32> [[TMP14]] to <4 x float>
+; NON-POW2-NEXT:    [[TMP16:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; NON-POW2-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP18:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> poison, <4 x i32> zeroinitializer
+; NON-POW2-NEXT:    [[TMP19:%.*]] = fadd <4 x float> [[TMP16]], [[TMP18]]
+; NON-POW2-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <4 x i32> zeroinitializer
+; NON-POW2-NEXT:    [[TMP22:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP21]], <4 x float> [[TMP15]], <4 x float> [[TMP19]])
+; NON-POW2-NEXT:    store <4 x float> [[TMP22]], ptr [[TMP0]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @and_lshr(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; POW2-ONLY-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP5]], 2
+; POW2-ONLY-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP5]], 4
+; POW2-ONLY-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; POW2-ONLY-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP6]], i32 1
+; POW2-ONLY-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP7]], i32 2
+; POW2-ONLY-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+; POW2-ONLY-NEXT:    [[TMP12:%.*]] = and <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; POW2-ONLY-NEXT:    [[TMP13:%.*]] = lshr <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; POW2-ONLY-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; POW2-ONLY-NEXT:    [[TMP15:%.*]] = sitofp <4 x i32> [[TMP14]] to <4 x float>
+; POW2-ONLY-NEXT:    [[TMP16:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; POW2-ONLY-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[TMP18:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> poison, <4 x i32> zeroinitializer
+; POW2-ONLY-NEXT:    [[TMP19:%.*]] = fadd <4 x float> [[TMP16]], [[TMP18]]
+; POW2-ONLY-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <4 x i32> zeroinitializer
+; POW2-ONLY-NEXT:    [[TMP22:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP21]], <4 x float> [[TMP15]], <4 x float> [[TMP19]])
+; POW2-ONLY-NEXT:    store <4 x float> [[TMP22]], ptr [[TMP0]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @and_lshr(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; COPYABLE-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; COPYABLE-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; COPYABLE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 0, i32 2, i32 4, i32 6>
+; COPYABLE-NEXT:    [[TMP9:%.*]] = and <4 x i32> [[TMP8]], <i32 3, i32 3, i32 3, i32 -1>
+; COPYABLE-NEXT:    [[TMP10:%.*]] = sitofp <4 x i32> [[TMP9]] to <4 x float>
+; COPYABLE-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; COPYABLE-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT:    [[TMP14:%.*]] = fadd <4 x float> [[TMP11]], [[TMP13]]
+; COPYABLE-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; COPYABLE-NEXT:    [[TMP16:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT:    [[TMP17:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP16]], <4 x float> [[TMP10]], <4 x float> [[TMP14]])
+; COPYABLE-NEXT:    store <4 x float> [[TMP17]], ptr [[TMP0]], align 4
+; COPYABLE-NEXT:    ret void
+;
+entry:
+  %5 = getelementptr inbounds float, ptr %0, i64 1
+  %6 = getelementptr inbounds float, ptr %0, i64 2
+  %7 = getelementptr inbounds float, ptr %0, i64 3
+  %8 = load i8, ptr %1, align 1
+  %9 = zext i8 %8 to i32
+  %10 = and i32 %9, 3
+  %11 = sitofp i32 %10 to float
+  %12 = lshr i32 %9, 2
+  %13 = and i32 %12, 3
+  %14 = sitofp i32 %13 to float
+  %15 = lshr i32 %9, 4
+  %16 = and i32 %15, 3
+  %17 = sitofp i32 %16 to float
+  %18 = lshr i32 %9, 6
+  %19 = sitofp i32 %18 to float
+  %20 = load float, ptr %0, align 4
+  %21 = fadd float %20, %3
+  %22 = tail call noundef float @llvm.fma.f32(float %2, float %11, float %21)
+  store float %22, ptr %0, align 4
+  %23 = load float, ptr %5, align 4
+  %24 = fadd float %23, %3
+  %25 = tail call noundef float @llvm.fma.f32(float %2, float %14, float %24)
+  store float %25, ptr %5, align 4
+  %26 = load float, ptr %6, align 4
+  %27 = fadd float %26, %3
+  %28 = tail call noundef float @llvm.fma.f32(float %2, float %17, float %27)
+  store float %28, ptr %6, align 4
+  %29 = load float, ptr %7, align 4
+  %30 = fadd float %29, %3
+  %31 = tail call noundef float @llvm.fma.f32(float %2, float %19, float %30)
+  store float %31, ptr %7, align 4
+  ret void
+}
+
+define void @add_shl(ptr %sinfo) {
+; NON-POW2-LABEL: @add_shl(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[SHL_I:%.*]] = shl i32 0, 0
+; NON-POW2-NEXT:    [[END_CODE_I:%.*]] = getelementptr i8, ptr [[SINFO:%.*]], i64 348
+; NON-POW2-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[SHL_I]], i32 3
+; NON-POW2-NEXT:    [[TMP1:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]]
+; NON-POW2-NEXT:    [[TMP2:%.*]] = shl <4 x i32> zeroinitializer, [[TMP0]]
+; NON-POW2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; NON-POW2-NEXT:    store <4 x i32> [[TMP3]], ptr [[END_CODE_I]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @add_shl(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[SHL_I:%.*]] = shl i32 0, 0
+; POW2-ONLY-NEXT:    [[END_CODE_I:%.*]] = getelementptr i8, ptr [[SINFO:%.*]], i64 348
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[SHL_I]], i32 3
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]]
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = shl <4 x i32> zeroinitializer, [[TMP0]]
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; POW2-ONLY-NEXT:    store <4 x i32> [[TMP3]], ptr [[END_CODE_I]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @add_shl(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[END_CODE_I:%.*]] = getelementptr i8, ptr [[SINFO:%.*]], i64 348
+; COPYABLE-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> <i32 0, i32 0, i32 poison, i32 poison>, <2 x i32> zeroinitializer, i64 2)
+; COPYABLE-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[TMP0]], zeroinitializer
+; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[END_CODE_I]], align 4
+; COPYABLE-NEXT:    ret void
+;
+entry:
+  %shl.i = shl i32 0, 0
+  %add.i = add i32 0, 0
+  %end_code.i = getelementptr i8, ptr %sinfo, i64 348
+  store i32 %add.i, ptr %end_code.i, align 4
+  %add.i.i = add i32 0, 0
+  %code_size.i.i = getelementptr i8, ptr %sinfo, i64 352
+  store i32 %add.i.i, ptr %code_size.i.i, align 8
+  %shl.i.i = shl i32 0, 0
+  %limit_code.i.i = getelementptr i8, ptr %sinfo, i64 356
+  store i32 %shl.i.i, ptr %limit_code.i.i, align 4
+  %add2.i.i = add i32 %shl.i, 0
+  %max_code.i.i = getelementptr i8, ptr %sinfo, i64 360
+  store i32 %add2.i.i, ptr %max_code.i.i, align 8
+  ret void
+}
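
For readers skimming the COPYABLE check lines above: this mode models a plain
scalar copy as the bundle's main operation applied with a neutral operand, so
bundles that mix an operation with bare copies still vectorize. A minimal
hand-written sketch of the idea (not taken from the test file; value names are
illustrative):

  ; Scalar form: lane 0 is an add, lane 1 merely copies the loaded value.
  %a = add nsw i32 %x0, 1
  store i32 %a, ptr %dst, align 4
  store i32 %x1, ptr %dst1, align 4   ; copyable lane, treated as "add %x1, 0"

  ; With -slp-vectorize-copyable the whole bundle becomes one vector add whose
  ; copy lane uses the additive identity:
  %v = load <2 x i32>, ptr %src, align 4
  %r = add nsw <2 x i32> %v, <i32 1, i32 0>
  store <2 x i32> %r, ptr %dst, align 4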

>From d0dc242f1b18aba43fa57062b7f4b7e5ffa593e5 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at gmail.com>
Date: Wed, 26 Feb 2025 13:11:56 +0000
Subject: [PATCH 2/9] Add a method to schedule copy instructions in
 BlockScheduling::schedule(); remove ReschedMap from BoUpSLP::scheduleBlock().

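As context (not part of the applied diff): a minimal hypothetical IR sketch
of the bundle shape this scheduling change targets. The function and value
names are invented, not taken from the test suite.

  define void @copy_sched_sketch(ptr noalias %src, ptr noalias %dst) {
    %p1 = getelementptr inbounds i32, ptr %src, i64 1
    %v0 = load i32, ptr %src, align 4
    %v1 = load i32, ptr %p1, align 4
    %a0 = add nsw i32 %v0, 1            ; main-opcode lane
    store i32 %a0, ptr %dst, align 4
    %d1 = getelementptr inbounds i32, ptr %dst, i64 1
    ; Copyable lane: there is no add here, so the lane is modeled as
    ; "add %v1, 0". The scheduler now tracks such lanes explicitly
    ; (IsCopy/CopyInst) and accounts for them as extra dependencies in
    ; calculateDependencies(), replacing the post-hoc ReschedMap reordering.
    store i32 %v1, ptr %d1, align 4
    ret void
  }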
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 80 +++++++++++--------
 1 file changed, 48 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5225eb2b2eefa..6b6c722ad7259 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3835,16 +3835,16 @@ class BoUpSLP {
           continue;
         auto It = ScalarToTreeEntries.find(V);
         Instruction *I = dyn_cast<Instruction>(V);
-        bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
-        if (S.isAltOpCopy() && IsAltInst) {
-          CopyableAltOp.insert(V);
-          continue;
-        }
         assert(
             (It == ScalarToTreeEntries.end() ||
              (It->getSecond().size() == 1 && It->getSecond().front() == Last) ||
              doesNotNeedToBeScheduled(V)) &&
             "Scalar already in tree!");
+        bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
+        if (S.isAltOpCopy() && IsAltInst) {
+          CopyableAltOp[V] = Last;
+          continue;
+        }
         if (It == ScalarToTreeEntries.end()) {
           ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
           (void)Processed.insert(V);
@@ -3954,8 +3954,8 @@ class BoUpSLP {
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
-  /// A set op scalars that we are considoring as copyable operations.
-  ValueSet CopyableAltOp;
+  /// Maps a scalar copy to its tree entry.
+  SmallDenseMap<Value *, TreeEntry *> CopyableAltOp;
 
   /// A set of first non-schedulable values.
   ValueSet NonScheduledFirst;
@@ -4264,6 +4264,9 @@ class BoUpSLP {
 
     /// True if this instruction is a copy.
     bool IsCopy = false;
+
+    /// Points to where the copyable instruction was introduced.
+    ScheduleData *CopyInst = nullptr;
   };
 
 #ifndef NDEBUG
@@ -4413,6 +4416,23 @@ class BoUpSLP {
           for (Use &U : BundleMember->Inst->operands())
             if (auto *I = dyn_cast<Instruction>(U.get()))
               DecrUnsched(I);
+          // Handle copy instruction dependencies.
+          if (TE && TE->isAltOpCopy() && BundleMember->IsCopy) {
+            doForAllOpcodes(BundleMember->Inst, [BundleMember, &ReadyList](
+                                                    ScheduleData *CopyUse) {
+              if (BundleMember != CopyUse && CopyUse->hasValidDependencies() &&
+                  CopyUse->incrementUnscheduledDeps(-1) == 0) {
+                ScheduleData *DepBundle = CopyUse->FirstInBundle;
+                assert(!DepBundle->IsScheduled &&
+                       "already scheduled bundle gets ready");
+                if (DepBundle->isReady()) {
+                  ReadyList.insert(DepBundle);
+                  LLVM_DEBUG(dbgs() << "SLP:    gets ready (copyable): "
+                                    << *DepBundle << "\n");
+                }
+              }
+            });
+          }
         }
         // Handle the memory dependencies.
         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
@@ -4498,8 +4518,8 @@ class BoUpSLP {
 
     /// Build a bundle from the ScheduleData nodes corresponding to the
     /// scalar instruction for each lane.
-    ScheduleData *buildBundle(ArrayRef<Value *> VL, const InstructionsState &S,
-                              bool &ReSchedule);
+    ScheduleData *buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+                              const InstructionsState &S, bool &ReSchedule);
 
     /// Checks if a bundle of instructions can be scheduled, i.e. has no
     /// cyclic dependencies. This is only a dry-run, no instructions are
@@ -17606,8 +17626,10 @@ void BoUpSLP::optimizeGatherSequence() {
   GatherShuffleExtractSeq.clear();
 }
 
-BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::buildBundle(
-    ArrayRef<Value *> VL, const InstructionsState &S, bool &ReSchedule) {
+BoUpSLP::ScheduleData *
+BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+                                      const InstructionsState &S,
+                                      bool &ReSchedule) {
   ScheduleData *Bundle = nullptr;
   ScheduleData *PrevInBundle = nullptr;
   unsigned Opcode = S.getOpcode();
@@ -17675,6 +17697,13 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::buildBundle(
     if (S.isAltOpCopy() && IsAltInst)
       BundleMember->IsCopy = true;
     PrevInBundle = BundleMember;
+    if (SLP->CopyableAltOp.contains(I)) {
+      TreeEntry *TE = SLP->CopyableAltOp[I];
+      assert(TE && "Incorrect state");
+      ScheduleData *SD = getScheduleData(I, TE);
+      assert(SD && SD->IsCopy && "ScheduleData incorrect state");
+      BundleMember->CopyInst = SD;
+    }
   }
   assert(Bundle && "Failed to find schedule bundle");
   return Bundle;
@@ -17772,7 +17801,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     ReSchedule = true;
   }
 
-  auto *Bundle = buildBundle(VL, S, ReSchedule);
+  auto *Bundle = buildBundle(VL, SLP, S, ReSchedule);
   if (!Bundle)
     return std::nullopt;
   TryScheduleBundleImpl(ReSchedule, Bundle);
@@ -17820,6 +17849,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
     BundleMember->NextInBundle = nullptr;
     BundleMember->TE = nullptr;
     BundleMember->IsCopy = false;
+    BundleMember->CopyInst = nullptr;
     if (BundleMember->unscheduledDepsInBundle() == 0) {
       ReadyInsts.insert(BundleMember);
     }
@@ -18010,6 +18040,12 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
       BundleMember->Dependencies = 0;
       BundleMember->resetUnscheduledDeps();
 
+      // Handle copy instruction dependencies.
+      if (BundleMember->CopyInst) {
+        BundleMember->Dependencies++;
+        BundleMember->incrementUnscheduledDeps(1);
+      }
+
       // Handle def-use chain dependencies.
       for (User *U : BundleMember->Inst->users()) {
         if (auto *I = dyn_cast<Instruction>(U)) {
@@ -18240,7 +18276,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   BS->initialFillReadyList(ReadyInsts);
 
   Instruction *LastScheduledInst = BS->ScheduleEnd;
-  DenseMap<ScheduleData *, ScheduleData *> ReschedMap;
 
   auto ReorderBundle = [this](ScheduleData *SD) {
     SmallVector<Instruction *, 2> Insts;
@@ -18273,16 +18308,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     ScheduleData *Picked = *ReadyInsts.begin();
     ReadyInsts.erase(ReadyInsts.begin());
 
-    // Reorder copyable elements to emit after main operations.
-    for (ScheduleData *BundleMember = Picked; BundleMember;
-         BundleMember = BundleMember->NextInBundle) {
-      if (CopyableAltOp.contains(BundleMember->Inst)) {
-        ScheduleData *SD = CopyElementsMap[BundleMember->Inst];
-        if (SD && SD->FirstInBundle != Picked)
-          ReschedMap[SD] = Picked;
-      }
-    }
-
     // Move the scheduled instruction(s) to their dedicated places, if not
     // there yet.
     for (Instruction *PickedInst : ReorderBundle(Picked)) {
@@ -18291,15 +18316,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
         PickedInst->moveAfter(LastScheduledInst->getPrevNode());
       LastScheduledInst = PickedInst;
     }
-    if (ReschedMap.contains(Picked)) {
-      ScheduleData *Resched = ReschedMap[Picked];
-      for (Instruction *PickedInst : ReorderBundle(Resched)) {
-        if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst &&
-            LastScheduledInst->getPrevNode())
-          PickedInst->moveAfter(LastScheduledInst->getPrevNode());
-        LastScheduledInst = PickedInst;
-      }
-    }
     BS->schedule(Picked, ReadyInsts);
   }
 

>From e9bd6d47df66d9cf45951d04b3b147fde10c5534 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at gmail.com>
Date: Mon, 3 Mar 2025 22:02:00 +0000
Subject: [PATCH 3/9] Resolved comments by removing the IsAltOpCopy flag from
 InstructionsState. Restored the original logic of handling only homogeneous
 operations in getSameOpcode(). Removed checkCopyableInnerDep(), moving its
 functionality into the scheduler.

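As context (not part of the applied diff): a hypothetical IR sketch of the
bundle shape the new getCopyableOpcode() path is meant to accept after
getSameOpcode() has rejected it, assuming the MainOp/AltOp pair must mix
exactly one BinaryOperator with a non-binary instruction. Function and value
names are invented.

  define void @copyable_opcode_sketch(ptr noalias %src, ptr noalias %dst) {
    %v0 = load i32, ptr %src, align 4
    %p1 = getelementptr inbounds i32, ptr %src, i64 1
    %v1 = load i32, ptr %p1, align 4
    %a0 = add nsw i32 %v0, 5            ; MainOp: add (BinaryOperator)
    store i32 %a0, ptr %dst, align 4
    %d1 = getelementptr inbounds i32, ptr %dst, i64 1
    ; Alternate lane: the stored value is the load itself, so AltOp is the
    ; load and the lane is represented as the copy "add %v1, 0" instead of
    ; invalidating the whole bundle.
    store i32 %v1, ptr %d1, align 4
    ret void
  }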
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 356 ++++++-----
 .../X86/vect_copyable_in_binops.ll            | 579 ++++++------------
 2 files changed, 352 insertions(+), 583 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6b6c722ad7259..24f47f5abd692 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -854,9 +854,6 @@ class InstructionsState {
   Instruction *MainOp = nullptr;
   Instruction *AltOp = nullptr;
 
-  /// True if alterative operation is copy instruction.
-  bool IsAltOpCopy = false;
-
 public:
   Instruction *getMainOp() const {
     assert(valid() && "InstructionsState is invalid.");
@@ -873,13 +870,9 @@ class InstructionsState {
 
   unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
 
-  bool isAltOpCopy() const { return IsAltOpCopy; }
-
   /// Some of the instructions in the list have alternate opcodes.
   bool isAltShuffle() const { return getMainOp() != getAltOp(); }
 
-  void setAltOpCopy(bool Val) { IsAltOpCopy = Val; }
-
   bool isOpcodeOrAlt(Instruction *I) const {
     unsigned CheckedOpcode = I->getOpcode();
     return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
@@ -920,18 +913,17 @@ static bool isValidForAlternation(unsigned Opcode) {
   return true;
 }
 
-// Check for inner dependencies, we could not support such depenedies if it
-// comes from a main operaion, only from alternative or for now we ignore
-// alternative operations depenedies to any alternative.
-static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
-                                  const InstructionsState &S);
-
-// Determine that the vector could be vectorized with copyable elements.
 static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt);
 
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                        const TargetLibraryInfo &TLI);
 
+static InstructionsState getCopyableOpcode(ArrayRef<Value *> VL,
+                                           const TargetLibraryInfo &TLI);
+
+static InstructionsState getCombinedOpcode(ArrayRef<Value *> VL,
+                                           const TargetLibraryInfo &TLI);
+
 /// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
 /// compatible instructions or constants, or just some other regular values.
 static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
@@ -981,51 +973,19 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     return InstructionsState::invalid();
 
   Instruction *MainOp = cast<Instruction>(*It);
-  Instruction *AltOp = MainOp;
-  unsigned Opcode = MainOp->getOpcode();
-  unsigned AltOpcode = Opcode;
-  for (Value *V : iterator_range(It + 1, VL.end())) {
-    Instruction *Inst = dyn_cast<Instruction>(V);
-    if (!Inst)
-      continue;
-    unsigned VOpcode = Inst->getOpcode();
-    if (Inst && AltOpcode == Opcode && !isa<PHINode>(Inst) &&
-        VOpcode != Opcode && isValidForAlternation(VOpcode)) {
-      AltOpcode = VOpcode;
-      AltOp = Inst;
-      break;
-    }
-  }
   unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
   if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
       (VL.size() == 2 && InstCnt < 2))
     return InstructionsState::invalid();
-  bool IsBinOp = isa<BinaryOperator>(MainOp);
-  bool IsCopyable = false;
-  if (MainOp && AltOp && MainOp != AltOp) {
-    if (!IsBinOp && isa<BinaryOperator>(AltOp) && !isa<PHINode>(MainOp)) {
-      std::swap(MainOp, AltOp);
-      std::swap(AltOpcode, Opcode);
-      IsBinOp = true;
-    }
-    IsCopyable = VectorizeCopyable && isCopyableOp(VL, MainOp, AltOp);
-    if (IsCopyable && isa<CmpInst>(AltOp)) {
-      Type *Ty0 = MainOp->getOperand(0)->getType();
-      Type *Ty1 = AltOp->getOperand(0)->getType();
-      if (Ty0 != Ty1)
-        return InstructionsState::invalid();
-    } else if (!IsCopyable) {
-      MainOp = cast<Instruction>(*It);
-      AltOp = MainOp;
-      Opcode = MainOp->getOpcode();
-      AltOpcode = Opcode;
-      IsBinOp = isa<BinaryOperator>(MainOp);
-    }
-  }
+
   bool IsCastOp = isa<CastInst>(MainOp);
+  bool IsBinOp = isa<BinaryOperator>(MainOp);
   bool IsCmpOp = isa<CmpInst>(MainOp);
   CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                         : CmpInst::BAD_ICMP_PREDICATE;
+  Instruction *AltOp = MainOp;
+  unsigned Opcode = MainOp->getOpcode();
+  unsigned AltOpcode = Opcode;
 
   bool SwappedPredsCompatible = IsCmpOp && [&]() {
     SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
@@ -1081,12 +1041,12 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
         AltOp = I;
         continue;
       }
-    } else if ((IsCastOp || IsCopyable) && isa<CastInst>(I)) {
+    } else if (IsCastOp && isa<CastInst>(I)) {
       Value *Op0 = MainOp->getOperand(0);
       Type *Ty0 = Op0->getType();
       Value *Op1 = I->getOperand(0);
       Type *Ty1 = Op1->getType();
-      if (Ty0 == Ty1 || IsCopyable) {
+      if (Ty0 == Ty1) {
         if (InstOpcode == Opcode || InstOpcode == AltOpcode)
           continue;
         if (Opcode == AltOpcode) {
@@ -1098,15 +1058,13 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
           continue;
         }
       }
-    } else if (auto *Inst = dyn_cast<CmpInst>(I);
-               Inst && (IsCmpOp || IsCopyable)) {
+    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
       auto *BaseInst = cast<CmpInst>(MainOp);
       Type *Ty0 = BaseInst->getOperand(0)->getType();
       Type *Ty1 = Inst->getOperand(0)->getType();
       if (Ty0 == Ty1) {
-        assert((IsCopyable || InstOpcode == Opcode) &&
-               "Expected same CmpInst opcode.");
-        assert((IsCopyable || InstOpcode == AltOpcode) &&
+        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
+        assert(InstOpcode == AltOpcode &&
                "Alternate instructions are only supported by BinaryOperator "
                "and CastInst.");
         // Check for compatible operands. If the corresponding operands are not
@@ -1137,32 +1095,23 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
             AltPred == CurrentPred || AltPred == SwappedCurrentPred)
           continue;
       }
-    } else if (InstOpcode == Opcode ||
-               (IsCopyable && InstOpcode == AltOpcode)) {
-      assert((IsCopyable || InstOpcode == AltOpcode) &&
+    } else if (InstOpcode == Opcode) {
+      assert(InstOpcode == AltOpcode &&
              "Alternate instructions are only supported by BinaryOperator and "
              "CastInst.");
-      Instruction *Op = MainOp;
-      if (IsCopyable) {
-        if (InstOpcode != Opcode && InstOpcode != AltOpcode) {
-          Op = I;
-        } else if (Opcode != AltOpcode && InstOpcode == AltOpcode) {
-          Op = AltOp;
-        }
-      }
       if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
         if (Gep->getNumOperands() != 2 ||
-            Gep->getOperand(0)->getType() != Op->getOperand(0)->getType())
+            Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
           return InstructionsState::invalid();
       } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
         if (!isVectorLikeInstWithConstOps(EI))
           return InstructionsState::invalid();
       } else if (auto *LI = dyn_cast<LoadInst>(I)) {
-        auto *BaseLI = cast<LoadInst>(Op);
+        auto *BaseLI = cast<LoadInst>(MainOp);
         if (!LI->isSimple() || !BaseLI->isSimple())
           return InstructionsState::invalid();
       } else if (auto *Call = dyn_cast<CallInst>(I)) {
-        auto *CallBase = cast<CallInst>(Op);
+        auto *CallBase = cast<CallInst>(MainOp);
         if (Call->getCalledFunction() != CallBase->getCalledFunction())
           return InstructionsState::invalid();
         if (Call->hasOperandBundles() &&
@@ -1177,14 +1126,13 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
           return InstructionsState::invalid();
         if (!ID) {
           SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
-          if (Mappings.size() &&
-              (Mappings.size() != BaseMappings.size() ||
-               Mappings.front().ISA != BaseMappings.front().ISA ||
-               Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
-               Mappings.front().VectorName != BaseMappings.front().VectorName ||
-               Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
-               Mappings.front().Shape.Parameters !=
-                   BaseMappings.front().Shape.Parameters))
+          if (Mappings.size() != BaseMappings.size() ||
+              Mappings.front().ISA != BaseMappings.front().ISA ||
+              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
+              Mappings.front().VectorName != BaseMappings.front().VectorName ||
+              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
+              Mappings.front().Shape.Parameters !=
+                  BaseMappings.front().Shape.Parameters)
             return InstructionsState::invalid();
         }
       }
@@ -1196,6 +1144,69 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
   return InstructionsState(MainOp, AltOp);
 }
 
+/// \returns analysis of the instructions in \p VL described in
+/// InstructionsState, proposing to vectorize with copyable instructions.
+static InstructionsState getCopyableOpcode(ArrayRef<Value *> VL,
+                                           const TargetLibraryInfo &TLI) {
+  if (!all_of(VL, IsaPred<Instruction>))
+    return InstructionsState::invalid();
+  Instruction *MainOp = dyn_cast<Instruction>(VL[0]);
+  Instruction *AltOp = nullptr;
+  unsigned Opcode = MainOp->getOpcode();
+  unsigned AltOpcode = Opcode;
+  if (MainOp && VectorizeCopyable && all_of(VL, IsaPred<Instruction>)) {
+    for (Value *V : VL) {
+      Instruction *I = cast<Instruction>(V);
+      if (I->isIntDivRem() || I->isFPDivRem())
+        return InstructionsState::invalid();
+      if (isa<PHINode>(I)) {
+        AltOp = nullptr;
+        break;
+      }
+      unsigned VOpcode = I->getOpcode();
+      if (VOpcode != Opcode) {
+        if (AltOpcode == Opcode) {
+          AltOpcode = VOpcode;
+          AltOp = I;
+        }
+        if (VOpcode != AltOpcode) {
+          AltOp = nullptr;
+          break;
+        }
+      }
+    }
+    if (AltOp) {
+      bool IsBinOp = isa<BinaryOperator>(MainOp);
+      bool IsAltBinOp = isa<BinaryOperator>(AltOp);
+      if (!IsBinOp && IsAltBinOp) {
+        std::swap(MainOp, AltOp);
+        std::swap(IsBinOp, IsAltBinOp);
+        std::swap(Opcode, AltOpcode);
+      }
+      if ((IsBinOp || IsAltBinOp) && !(IsBinOp && IsAltBinOp) &&
+          isCopyableOp(VL, MainOp, AltOp)) {
+        SmallVector<Value *, 8> MainOps, AltOps;
+        for (Value *V : VL) {
+          Instruction *I = cast<Instruction>(V);
+          if (I->getOpcode() == Opcode)
+            MainOps.push_back(I);
+          else
+            AltOps.push_back(I);
+        }
+        if (getSameOpcode(MainOps, TLI) && getSameOpcode(AltOps, TLI))
+          return InstructionsState(MainOp, AltOp);
+      }
+    }
+  }
+  return InstructionsState::invalid();
+}
+
+static InstructionsState getCombinedOpcode(ArrayRef<Value *> VL,
+                                           const TargetLibraryInfo &TLI) {
+  InstructionsState S = getSameOpcode(VL, TLI);
+  return (S) ? S : getCopyableOpcode(VL, TLI);
+}
+
 /// \returns true if all of the values in \p VL have the same type or false
 /// otherwise.
 static bool allSameType(ArrayRef<Value *> VL) {
@@ -1233,37 +1244,7 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
   }
 }
 
-static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
-                                  const InstructionsState &S) {
-  SmallSet<Value *, 4> Ops;
-  SmallSet<Value *, 4> AltOps;
-  unsigned Opcode = S.getOpcode();
-  for (Value *V : VL) {
-    auto *I = dyn_cast<Instruction>(V);
-    if (!I)
-      continue;
-    if (I->getOpcode() == Opcode)
-      Ops.insert(V);
-    else
-      AltOps.insert(V);
-  }
-  for (Value *V : VL) {
-    auto *I = dyn_cast<Instruction>(V);
-    if (!I)
-      continue;
-    for (Use &U : I->operands())
-      if (auto *Op = dyn_cast<Instruction>(U.get()); Op && Ops.contains(Op))
-        return false;
-    if (I->getOpcode() != Opcode) {
-      for (Use &U : I->operands())
-        if (auto *Op = dyn_cast<Instruction>(U.get());
-            Op && AltOps.contains(Op))
-          return false;
-    }
-  }
-  return true;
-}
-
+// Determine whether the vector could be vectorized with copyable elements.
 static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
   if (any_of(VL, IsaPred<PoisonValue>) || Main == Alt ||
       !isa<BinaryOperator>(Main) || !isa<Instruction>(Alt) ||
@@ -1281,6 +1262,7 @@ static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
     return true;
   return false;
 }
+
 /// \returns the AA location that is being access by the instruction.
 static MemoryLocation getLocation(Instruction *I) {
   if (StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -3526,6 +3508,9 @@ class BoUpSLP {
     /// Interleaving factor for interleaved loads Vectorize nodes.
     unsigned InterleaveFactor = 0;
 
+    /// True if the alternative operation is a copy instruction.
+    bool IsAltOpCopy = false;
+
   public:
     /// Returns interleave factor for interleave nodes.
     unsigned getInterleaveFactor() const { return InterleaveFactor; }
@@ -3581,7 +3566,7 @@ class BoUpSLP {
     }
 
     /// Some of the instructions in the list have alternate opcodes.
-    bool isAltShuffle() const { return S.isAltShuffle() && !S.isAltOpCopy(); }
+    bool isAltShuffle() const { return S.isAltShuffle() && !IsAltOpCopy; }
 
     bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
 
@@ -3609,7 +3594,9 @@ class BoUpSLP {
 
     unsigned getAltOpcode() const { return S.getAltOpcode(); }
 
-    bool isAltOpCopy() const { return S.isAltOpCopy(); }
+    bool isAltOpCopy() const { return IsAltOpCopy; }
+
+    void setAltOpCopy(bool Val) { IsAltOpCopy = Val; }
 
     bool hasState() const { return S.valid(); }
 
@@ -3710,7 +3697,7 @@ class BoUpSLP {
       if (S) {
         dbgs() << "MainOp: " << *S.getMainOp() << "\n";
         dbgs() << "AltOp: " << *S.getAltOp() << "\n";
-        dbgs() << "isAltOpCopy: " << S.isAltOpCopy() << "\n";
+        dbgs() << "IsAltOpCopy: " << IsAltOpCopy << "\n";
       } else {
         dbgs() << "MainOp: NULL\n";
         dbgs() << "AltOp: NULL\n";
@@ -3764,29 +3751,27 @@ class BoUpSLP {
 #endif
 
   /// Create a new VectorizableTree entry.
-  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
-                          std::optional<ScheduleData *> Bundle,
-                          const InstructionsState &S,
-                          const EdgeInfo &UserTreeIdx,
-                          ArrayRef<int> ReuseShuffleIndices = {},
-                          ArrayRef<unsigned> ReorderIndices = {},
-                          unsigned InterleaveFactor = 0) {
+  TreeEntry *
+  newTreeEntry(ArrayRef<Value *> VL, std::optional<ScheduleData *> Bundle,
+               const InstructionsState &S, const EdgeInfo &UserTreeIdx,
+               ArrayRef<int> ReuseShuffleIndices = {},
+               ArrayRef<unsigned> ReorderIndices = {},
+               unsigned InterleaveFactor = 0, bool IsAltOpCopy = false) {
     TreeEntry::EntryState EntryState =
         Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
-    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
-                                ReuseShuffleIndices, ReorderIndices);
+    TreeEntry *E =
+        newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
+                     ReuseShuffleIndices, ReorderIndices, IsAltOpCopy);
     if (E && InterleaveFactor > 0)
       E->setInterleave(InterleaveFactor);
     return E;
   }
 
-  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
-                          TreeEntry::EntryState EntryState,
-                          std::optional<ScheduleData *> Bundle,
-                          const InstructionsState &S,
-                          const EdgeInfo &UserTreeIdx,
-                          ArrayRef<int> ReuseShuffleIndices = {},
-                          ArrayRef<unsigned> ReorderIndices = {}) {
+  TreeEntry *newTreeEntry(
+      ArrayRef<Value *> VL, TreeEntry::EntryState EntryState,
+      std::optional<ScheduleData *> Bundle, const InstructionsState &S,
+      const EdgeInfo &UserTreeIdx, ArrayRef<int> ReuseShuffleIndices = {},
+      ArrayRef<unsigned> ReorderIndices = {}, bool IsAltOpCopy = false) {
     assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
             (Bundle && EntryState != TreeEntry::NeedToGather)) &&
            "Need to vectorize gather entry?");
@@ -3804,7 +3789,7 @@ class BoUpSLP {
     // for non-power-of-two vectors.
     assert(
         (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
-         S.isAltOpCopy() || ReuseShuffleIndices.empty()) &&
+         ReuseShuffleIndices.empty()) &&
         "Reshuffling scalars not yet supported for nodes with padding");
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
@@ -3829,6 +3814,8 @@ class BoUpSLP {
     if (!Last->isGather()) {
       SmallPtrSet<Value *, 4> Processed;
       unsigned Opcode = S.getOpcode();
+      if (IsAltOpCopy)
+        Last->setAltOpCopy(true);
       for (unsigned i = 0; i < VL.size(); ++i) {
         Value *V = VL[i];
         if (isa<PoisonValue>(V))
@@ -3841,7 +3828,7 @@ class BoUpSLP {
              doesNotNeedToBeScheduled(V)) &&
             "Scalar already in tree!");
         bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
-        if (S.isAltOpCopy() && IsAltInst) {
+        if (IsAltOpCopy && IsAltInst) {
           CopyableAltOp[V] = Last;
           continue;
         }
@@ -4418,17 +4405,16 @@ class BoUpSLP {
               DecrUnsched(I);
           // Handle copy instruction dependencies.
           if (TE && TE->isAltOpCopy() && BundleMember->IsCopy) {
-            doForAllOpcodes(BundleMember->Inst, [BundleMember, &ReadyList](
-                                                    ScheduleData *CopyUse) {
-              if (BundleMember != CopyUse && CopyUse->hasValidDependencies() &&
+            doForAllOpcodes(BundleMember->Inst, [BundleMember, &ReadyList](ScheduleData *CopyUse) {
+              if (BundleMember != CopyUse &&
+                  CopyUse->hasValidDependencies() &&
                   CopyUse->incrementUnscheduledDeps(-1) == 0) {
                 ScheduleData *DepBundle = CopyUse->FirstInBundle;
                 assert(!DepBundle->IsScheduled &&
-                       "already scheduled bundle gets ready");
+                   "already scheduled bundle gets ready");
                 if (DepBundle->isReady()) {
                   ReadyList.insert(DepBundle);
-                  LLVM_DEBUG(dbgs() << "SLP:    gets ready (copyable): "
-                                    << *DepBundle << "\n");
+                  LLVM_DEBUG(dbgs() << "SLP:    gets ready (copyable): " << *DepBundle << "\n");
                 }
               }
             });
@@ -4519,7 +4505,8 @@ class BoUpSLP {
     /// Build a bundle from the ScheduleData nodes corresponding to the
     /// scalar instruction for each lane.
     ScheduleData *buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
-                              const InstructionsState &S, bool &ReSchedule);
+                              const InstructionsState &S, bool &ReSchedule,
+                              bool IsAltOpCopy);
 
     /// Checks if a bundle of instructions can be scheduled, i.e. has no
     /// cyclic dependencies. This is only a dry-run, no instructions are
@@ -4529,7 +4516,8 @@ class BoUpSLP {
     std::optional<ScheduleData *> tryScheduleBundle(ArrayRef<Value *> VL,
                                                     BoUpSLP *SLP,
                                                     const InstructionsState &S,
-                                                    bool AnyCopies);
+                                                    bool AnyCopies,
+                                                    bool IsAltOpCopy);
 
     /// Un-bundles a group of instructions.
     void cancelScheduling(ArrayRef<Value *> VL, ScheduleData *Bundle);
@@ -7937,8 +7925,6 @@ bool BoUpSLP::canRepresentAsCopyable(const InstructionsState &S,
   SmallVector<unsigned> MainAltOps;
   unsigned Operand;
 
-  if (!checkCopyableInnerDep(VL, S))
-    return false;
   if (isCopyableOp(VL, S.getMainOp(), S.getAltOp()))
     return true;
   if ((isa<BinaryOperator>(S.getMainOp()) && Opcode0 == Opcode1) ||
@@ -8507,11 +8493,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     return TreeEntry::Vectorize;
   }
   case Instruction::ShuffleVector: {
-    if (VectorizeCopyable && isCopyableOp(VL, S.getMainOp(), S.getAltOp()) &&
-        checkCopyableInnerDep(VL, S)) {
-      S.setAltOpCopy(true);
-      return TreeEntry::Vectorize;
-    }
     if (!S.isAltShuffle()) {
       // REVEC can support non alternate shuffle.
       if (SLPReVec && getShufflevectorNumGroups(VL))
@@ -8528,9 +8509,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
              "the whole alt sequence is not profitable.\n");
       return TreeEntry::NeedToGather;
     }
-    if (VectorizeCopyable && canRepresentAsCopyable(S, VL))
-      S.setAltOpCopy(true);
-
     return TreeEntry::Vectorize;
   }
   default:
@@ -8708,6 +8686,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   };
 
   InstructionsState S = getSameOpcode(VL, *TLI);
+  bool IsAltOpCopy = false;
+  if (!S && VectorizeCopyable) {
+    S = getCopyableOpcode(VL, *TLI);
+    if (S) {
+      if (isCopyableOp(VL, S.getMainOp(), S.getAltOp()) ||
+          canRepresentAsCopyable(S, VL)) {
+        IsAltOpCopy = true;
+      } else {
+        S = InstructionsState::invalid();
+      }
+    }
+  } else if (VectorizeCopyable && canRepresentAsCopyable(S, VL))
+    IsAltOpCopy = true;
 
   // Don't go into catchswitch blocks, which can happen with PHIs.
   // Such blocks can only have PHIs and the catchswitch.  There is no
@@ -8954,7 +8945,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   SmallVector<Value *> PointerOps;
   TreeEntry::EntryState State = getScalarsVectorizationState(
       S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
-  if (S.isAltOpCopy()) {
+  if (IsAltOpCopy) {
+    State = TreeEntry::Vectorize;
     for (Value *V : VL) {
       Instruction *I = dyn_cast<Instruction>(V);
       if (!I)
@@ -8967,7 +8959,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     }
   }
   if (State == TreeEntry::NeedToGather ||
-      (S.isAltOpCopy() && !has_single_bit(UniqueValues.size()))) {
+      (IsAltOpCopy && !has_single_bit(UniqueValues.size()))) {
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndices);
     return;
@@ -8985,15 +8977,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
   BlockScheduling &BS = *BSRef;
   std::optional<ScheduleData *> Bundle;
-  Bundle = BS.tryScheduleBundle(UniqueValues, this, S, AnyCopies);
+  Bundle = BS.tryScheduleBundle(UniqueValues, this, S, AnyCopies, IsAltOpCopy);
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
 #endif
-  if (!Bundle || (S.isAltOpCopy() && !Bundle.value())) {
+  if (!Bundle || (IsAltOpCopy && !Bundle.value())) {
     LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
     assert((!BS.getScheduleData(VL0) ||
-            !BS.getScheduleData(VL0)->isPartOfBundle() || S.isAltOpCopy() ||
+            !BS.getScheduleData(VL0)->isPartOfBundle() ||
             (BS.getScheduleData(VL0)->TE &&
              BS.getScheduleData(VL0)->TE->isAltOpCopy())) &&
            "tryScheduleBundle should cancelScheduling on failure");
@@ -9386,7 +9378,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     }
     case Instruction::ShuffleVector: {
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   ReuseShuffleIndices);
+                                   ReuseShuffleIndices, {}, 0, IsAltOpCopy);
       if (S.isAltShuffle()) {
         LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                    TE->dump());
@@ -9396,7 +9388,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
             TE->dump());
       }
-      if (S.isAltOpCopy() && !isCopyableOp(VL, S.getMainOp(), S.getAltOp())) {
+      if (IsAltOpCopy && !isCopyableOp(VL, S.getMainOp(), S.getAltOp())) {
         ValueList Left, Right;
         unsigned Opcode0 = S.getOpcode();
         unsigned Opcode1 = S.getAltOpcode();
@@ -9440,7 +9432,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         buildTree_rec(newVL, Depth + 1, {TE, 0});
         buildTree_rec(Right, Depth + 1, {TE, 1});
         return;
-      } else if (S.isAltOpCopy()) {
+      } else if (IsAltOpCopy) {
         ValueList Left, Right;
         unsigned Opcode0 = S.getOpcode();
         VLOperands Ops(VL, S, *this);
@@ -11622,12 +11614,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
 const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                    unsigned Idx) const {
   ArrayRef<Value *> VL = E->getOperand(Idx);
-  InstructionsState S = getSameOpcode(VL, *TLI);
+  InstructionsState S = getCombinedOpcode(VL, *TLI);
   // Special processing for GEPs bundle, which may include non-gep values.
   if (!S && VL.front()->getType()->isPointerTy()) {
     const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
     if (It != VL.end())
-      S = getSameOpcode(*It, *TLI);
+      S = getCombinedOpcode(*It, *TLI);
   }
   if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx, VL, S))
     return VE;
@@ -13166,8 +13158,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
         std::optional<unsigned> InsertIdx = getElementIndex(VU);
         if (InsertIdx) {
           const TreeEntry *ScalarTE = &EU.E;
-          if (!ScalarTE)
-            continue;
           auto *It = find_if(
               ShuffledInserts,
               [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
@@ -15285,12 +15275,12 @@ BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
 
 Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
   ValueList &VL = E->getOperand(NodeIdx);
-  InstructionsState S = getSameOpcode(VL, *TLI);
+  InstructionsState S = getCombinedOpcode(VL, *TLI);
   // Special processing for GEPs bundle, which may include non-gep values.
   if (!S && VL.front()->getType()->isPointerTy()) {
     const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
     if (It != VL.end())
-      S = getSameOpcode(*It, *TLI);
+      S = getCombinedOpcode(*It, *TLI);
   }
   const unsigned VF = VL.size();
   if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx, VL, S)) {
@@ -17629,13 +17619,15 @@ void BoUpSLP::optimizeGatherSequence() {
 BoUpSLP::ScheduleData *
 BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                       const InstructionsState &S,
-                                      bool &ReSchedule) {
+                                      bool &ReSchedule, bool IsAltOpCopy) {
   ScheduleData *Bundle = nullptr;
   ScheduleData *PrevInBundle = nullptr;
   unsigned Opcode = S.getOpcode();
   ValueList Keys;
 
   for (Value *V : VL) {
+    if (doesNotNeedToBeScheduled(V) && IsAltOpCopy)
+      return nullptr;
     auto *SD = getScheduleData(V);
     bool FoundKey = false;
     if (SD && !SD->isPartOfBundle()) {
@@ -17694,7 +17686,7 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
 
     // Group the instructions to a bundle.
     BundleMember->FirstInBundle = Bundle;
-    if (S.isAltOpCopy() && IsAltInst)
+    if (IsAltOpCopy && IsAltInst)
       BundleMember->IsCopy = true;
     PrevInBundle = BundleMember;
     if (SLP->CopyableAltOp.contains(I)) {
@@ -17714,7 +17706,7 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
 std::optional<BoUpSLP::ScheduleData *>
 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                             const InstructionsState &S,
-                                            bool AnyCopies) {
+                                            bool AnyCopies, bool IsAltOpCopy) {
   // No need to schedule PHIs, insertelement, extractelement and extractvalue
   // instructions.
   if (isa<PHINode>(S.getMainOp()) ||
@@ -17723,7 +17715,6 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
 
   // Initialize the instruction bundle.
   Instruction *OldScheduleEnd = ScheduleEnd;
-  bool IsAltOpCopy = S.isAltOpCopy();
   LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");
 
   auto TryScheduleBundleImpl = [this, OldScheduleEnd, IsAltOpCopy, AnyCopies,
@@ -17801,7 +17792,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     ReSchedule = true;
   }
 
-  auto *Bundle = buildBundle(VL, SLP, S, ReSchedule);
+  auto *Bundle = buildBundle(VL, SLP, S, ReSchedule, IsAltOpCopy);
   if (!Bundle)
     return std::nullopt;
   TryScheduleBundleImpl(ReSchedule, Bundle);
@@ -18051,9 +18042,20 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
         if (auto *I = dyn_cast<Instruction>(U)) {
           doForAllOpcodes(I, [&](ScheduleData *UseSD) {
             ScheduleData *DestBundle = UseSD->FirstInBundle;
-            if ((UseSD->IsCopy || BundleMember->IsCopy) &&
+            if (BundleMember->IsCopy && !UseSD->IsCopy &&
                 DestBundle == BundleMember->FirstInBundle)
               return;
+            // For copy operations, check for inner dependencies: we cannot
+            // support such dependencies if they come from a main operation,
+            // only from an alternative one; for now we ignore dependencies
+            // of alternative operations on any alternative.
+            if (BundleMember->TE && BundleMember->TE->isAltOpCopy() &&
+                (!BundleMember->IsCopy ||
+                 (BundleMember->IsCopy && UseSD->IsCopy)) &&
+                DestBundle == BundleMember->FirstInBundle) {
+              BundleMember->Dependencies++;
+              BundleMember->incrementUnscheduledDeps(1);
+            }
             BundleMember->Dependencies++;
             if (!DestBundle->IsScheduled)
               BundleMember->incrementUnscheduledDeps(1);
@@ -18243,15 +18245,13 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     }
   };
   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
-  SmallDenseMap<Value *, ScheduleData *> CopyElementsMap;
 
   // Ensure that all dependency data is updated (for nodes in the sub-graph)
   // and fill the ready-list with initial instructions.
   int Idx = 0;
   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
        I = I->getNextNode()) {
-    BS->doForAllOpcodes(I, [this, &Idx, &CopyElementsMap,
-                            BS](ScheduleData *SD) {
+    BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
       [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(SD->Inst);
       assert((isVectorLikeInstWithConstOps(SD->Inst) || SD->IsCopy ||
               SD->isPartOfBundle() ==
@@ -18259,16 +18259,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
                    !doesNotNeedToSchedule(SDTEs.front()->Scalars))) &&
              "scheduler and vectorizer bundle mismatch");
       SD->FirstInBundle->SchedulingPriority = Idx++;
-      for (TreeEntry *SDTE : SDTEs)
-        if (SDTE && SDTE->isAltOpCopy()) {
-          ScheduleData *Bundle = SD->FirstInBundle;
-          for (ScheduleData *BundleMember = Bundle; BundleMember;
-               BundleMember = BundleMember->NextInBundle) {
-            if (BundleMember->IsCopy)
-              CopyElementsMap[BundleMember->Inst] = Bundle;
-          }
-        }
-
       if (SD->isSchedulingEntity() && SD->isPartOfBundle())
         BS->calculateDependencies(SD, false, this);
     });
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 7fa746dc758a9..917ad682e26cc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -63,9 +63,18 @@ define void @add1(ptr noalias %dst, ptr noalias %src) {
 ;
 ; COPYABLE-LABEL: @add1(
 ; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
-; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT:    store i32 [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; COPYABLE-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; COPYABLE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT:    [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 1, i32 2>
+; COPYABLE-NEXT:    store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; COPYABLE-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
+; COPYABLE-NEXT:    store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4
 ; COPYABLE-NEXT:    ret void
 ;
 entry:
@@ -90,44 +99,21 @@ entry:
 }
 
 define void @sub0(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @sub0(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; NON-POW2-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; NON-POW2-NEXT:    store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @sub0(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; POW2-ONLY-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; POW2-ONLY-NEXT:    store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT:    ret void
-;
-; COPYABLE-LABEL: @sub0(
-; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 -1, i32 0, i32 -2, i32 -3>
-; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
-; COPYABLE-NEXT:    ret void
+; CHECK-LABEL: @sub0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -211,55 +197,23 @@ entry:
 }
 
 define void @addsub0(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @addsub0(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; NON-POW2-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; NON-POW2-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; NON-POW2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; NON-POW2-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @addsub0(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; POW2-ONLY-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; POW2-ONLY-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; POW2-ONLY-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; POW2-ONLY-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT:    ret void
-;
-; COPYABLE-LABEL: @addsub0(
-; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 0>
-; COPYABLE-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
-; COPYABLE-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; COPYABLE-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; COPYABLE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; COPYABLE-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; COPYABLE-NEXT:    ret void
+; CHECK-LABEL: @addsub0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -283,55 +237,23 @@ entry:
 }
 
 define void @addsub1(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @addsub1(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; NON-POW2-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; NON-POW2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; NON-POW2-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; NON-POW2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; NON-POW2-NEXT:    store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
-; NON-POW2-NEXT:    store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @addsub1(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; POW2-ONLY-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT:    [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
-; POW2-ONLY-NEXT:    store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; POW2-ONLY-NEXT:    ret void
-;
-; COPYABLE-LABEL: @addsub1(
-; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; COPYABLE-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; COPYABLE-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; COPYABLE-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
-; COPYABLE-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT:    [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP4]], <i32 0, i32 -3>
-; COPYABLE-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; COPYABLE-NEXT:    ret void
+; CHECK-LABEL: @addsub1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
+; CHECK-NEXT:    store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -355,44 +277,21 @@ entry:
 }
 
 define void @mul(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @mul(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
-; NON-POW2-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; NON-POW2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; NON-POW2-NEXT:    store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
-; NON-POW2-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
-; NON-POW2-NEXT:    store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @mul(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
-; POW2-ONLY-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
-; POW2-ONLY-NEXT:    store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT:    ret void
-;
-; COPYABLE-LABEL: @mul(
-; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[TMP0]], <i32 257, i32 -3, i32 1, i32 -9>
-; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
-; COPYABLE-NEXT:    ret void
+; CHECK-LABEL: @mul(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; CHECK-NEXT:    store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; CHECK-NEXT:    store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -445,9 +344,18 @@ define void @shl0(ptr noalias %dst, ptr noalias %src) {
 ;
 ; COPYABLE-LABEL: @shl0(
 ; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
-; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT:    store i32 [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; COPYABLE-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; COPYABLE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT:    [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 2>
+; COPYABLE-NEXT:    store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; COPYABLE-NEXT:    [[SHL8:%.*]] = shl i32 [[TMP3]], 3
+; COPYABLE-NEXT:    store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
 ; COPYABLE-NEXT:    ret void
 ;
 entry:
@@ -561,9 +469,18 @@ define void @add1f(ptr noalias %dst, ptr noalias %src) {
 ;
 ; COPYABLE-LABEL: @add1f(
 ; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; COPYABLE-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; COPYABLE-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; COPYABLE-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; COPYABLE-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
 ; COPYABLE-NEXT:    ret void
 ;
 entry:
@@ -588,44 +505,21 @@ entry:
 }
 
 define void @sub0f(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @sub0f(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store float [[ADD]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; NON-POW2-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; NON-POW2-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @sub0f(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store float [[ADD]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; POW2-ONLY-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT:    ret void
-;
-; COPYABLE-LABEL: @sub0f(
-; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; COPYABLE-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; COPYABLE-NEXT:    ret void
+; CHECK-LABEL: @sub0f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -709,55 +603,23 @@ entry:
 }
 
 define void @addsub0f(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @addsub0f(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store float [[SUB]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; NON-POW2-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; NON-POW2-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; NON-POW2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; NON-POW2-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @addsub0f(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store float [[SUB]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; POW2-ONLY-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; POW2-ONLY-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; POW2-ONLY-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT:    ret void
-;
-; COPYABLE-LABEL: @addsub0f(
-; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00>
-; COPYABLE-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; COPYABLE-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; COPYABLE-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; COPYABLE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; COPYABLE-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; COPYABLE-NEXT:    ret void
+; CHECK-LABEL: @addsub0f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[SUB]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -781,55 +643,23 @@ entry:
 }
 
 define void @addsub1f(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @addsub1f(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; NON-POW2-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; NON-POW2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; NON-POW2-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; NON-POW2-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; NON-POW2-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
-; NON-POW2-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @addsub1f(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
-; POW2-ONLY-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; POW2-ONLY-NEXT:    ret void
-;
-; COPYABLE-LABEL: @addsub1f(
-; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; COPYABLE-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; COPYABLE-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; COPYABLE-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; COPYABLE-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP4]], <float 0.000000e+00, float -3.000000e+00>
-; COPYABLE-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; COPYABLE-NEXT:    ret void
+; CHECK-LABEL: @addsub1f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
+; CHECK-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -853,44 +683,21 @@ entry:
 }
 
 define void @mulf(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @mulf(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; NON-POW2-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; NON-POW2-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; NON-POW2-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; NON-POW2-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; NON-POW2-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @mulf(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; POW2-ONLY-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT:    ret void
-;
-; COPYABLE-LABEL: @mulf(
-; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = fmul fast <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
-; COPYABLE-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; COPYABLE-NEXT:    ret void
+; CHECK-LABEL: @mulf(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -1107,49 +914,21 @@ entry:
 }
 
 define void @mulfn(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @mulfn(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; NON-POW2-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; NON-POW2-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; NON-POW2-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; NON-POW2-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; NON-POW2-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @mulfn(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; POW2-ONLY-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT:    ret void
-;
-; COPYABLE-LABEL: @mulfn(
-; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; COPYABLE-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; COPYABLE-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; COPYABLE-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; COPYABLE-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], <float 1.000000e+00, float -9.000000e+00>
-; COPYABLE-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
-; COPYABLE-NEXT:    ret void
+; CHECK-LABEL: @mulfn(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1

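A note on the test updates in the patch above: lanes that are plain copies in the scalar code are modeled as the main binary opcode applied to an identity constant, which is why the COPYABLE output for @mul used the multiplier vector <i32 257, i32 -3, i32 1, i32 -9> (identity 1 in the copied lane) and @sub0f used -0.0 for its copied fadd lane. A minimal standalone sketch of where such constants come from; getCopyLanePad is a hypothetical wrapper for illustration, while ConstantExpr::getBinOpIdentity is the real API the patch calls inside appendOperandsOfVL:

#include "llvm/IR/Constants.h"

using namespace llvm;

// Identity constant used to pad a "copyable" lane: mul -> 1, add -> 0,
// fmul -> 1.0, fadd -> -0.0. AllowRHSConstant=true also covers opcodes
// whose identity is only valid as the right-hand operand (sub, shl,
// lshr, ashr), matching the tests above that copy through shl/sub lanes.
static Constant *getCopyLanePad(unsigned Opcode, Type *Ty) {
  return ConstantExpr::getBinOpIdentity(Opcode, Ty, /*AllowRHSConstant=*/true);
}
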
>From d6199a589a1e76526571b2a4f96272ec8e7b40f6 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at gmail.com>
Date: Mon, 3 Mar 2025 22:21:34 +0000
Subject: [PATCH 4/9] Fix formatting.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 24f47f5abd692..bd3318839015a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4405,16 +4405,17 @@ class BoUpSLP {
               DecrUnsched(I);
           // Handle a copy instruction dependencies.
           if (TE && TE->isAltOpCopy() && BundleMember->IsCopy) {
-            doForAllOpcodes(BundleMember->Inst, [BundleMember, &ReadyList](ScheduleData *CopyUse) {
-              if (BundleMember != CopyUse &&
-                  CopyUse->hasValidDependencies() &&
+            doForAllOpcodes(BundleMember->Inst, [BundleMember, &ReadyList](
+                                                    ScheduleData *CopyUse) {
+              if (BundleMember != CopyUse && CopyUse->hasValidDependencies() &&
                   CopyUse->incrementUnscheduledDeps(-1) == 0) {
                 ScheduleData *DepBundle = CopyUse->FirstInBundle;
                 assert(!DepBundle->IsScheduled &&
-                   "already scheduled bundle gets ready");
+                       "already scheduled bundle gets ready");
                 if (DepBundle->isReady()) {
                   ReadyList.insert(DepBundle);
-                  LLVM_DEBUG(dbgs() << "SLP:    gets ready (copyable): " << *DepBundle << "\n");
+                  LLVM_DEBUG(dbgs() << "SLP:    gets ready (copyable): "
+                                    << *DepBundle << "\n");
                 }
               }
             });

>From 7d7d26700888c9984cd7f017193720b67b9930b5 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at yahoo.com>
Date: Thu, 13 Mar 2025 12:42:48 +0000
Subject: [PATCH 5/9] Update llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Co-authored-by: Alexey Bataev <a.bataev at gmx.com>
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bd3318839015a..9b522e8a72e91 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1246,9 +1246,9 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
 
 // Determine that the vector could be vectorized with copyable elements.
 static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
-  if (any_of(VL, IsaPred<PoisonValue>) || Main == Alt ||
+  if (Main == Alt ||
       !isa<BinaryOperator>(Main) || !isa<Instruction>(Alt) ||
-      find_if(VL, IsaPred<PHINode>) != VL.end())
+      any_of(VL, IsaPred<PoisonValue, PHINode>))
     return false;
 
   Instruction *MainOp = cast<Instruction>(Main);

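For context on the simplification in this patch: IsaPred<Ts...> from llvm/ADT/STLExtras.h is a stateless functor equivalent to a lambda over isa<Ts...>, so the variadic form folds the separate PoisonValue any_of and the PHINode find_if into a single scan. A small self-contained sketch of the idiom (hasPoisonOrPhi is a hypothetical helper, not part of the patch):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Equivalent to:
//   any_of(VL, [](Value *V) { return isa<PoisonValue, PHINode>(V); })
static bool hasPoisonOrPhi(ArrayRef<Value *> VL) {
  return any_of(VL, IsaPred<PoisonValue, PHINode>);
}
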
>From f48a1c7dac07d15d761c0d5574072e0a0f69c1bd Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at yahoo.com>
Date: Thu, 13 Mar 2025 12:43:10 +0000
Subject: [PATCH 6/9] Update llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Co-authored-by: Alexey Bataev <a.bataev at gmx.com>
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9b522e8a72e91..a3d0af0139692 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2627,7 +2627,7 @@ class BoUpSLP {
           }
           bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
-          Instruction *Inst = cast<Instruction>(VL[Lane]);
+          auto *Inst = cast<Instruction>(VL[Lane]);
           if (Inst->getOpcode() != MainOp->getOpcode() &&
               OpIdx > (Inst->getNumOperands() - 1)) {
             OpsVec[OpIdx][Lane] = {

>From 7c8363d3ba9ea18d7b444366ee3688b903fe8b92 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at gmail.com>
Date: Thu, 6 Mar 2025 09:18:50 +0000
Subject: [PATCH 7/9] Resolved comments.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 231 ++++++++----------
 1 file changed, 96 insertions(+), 135 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a3d0af0139692..d61d2f7ea7d42 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1148,20 +1148,19 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
 /// InstructionsState in propose to vectorize with copyable instructions.
 static InstructionsState getCopyableOpcode(ArrayRef<Value *> VL,
                                            const TargetLibraryInfo &TLI) {
-  if (!all_of(VL, IsaPred<Instruction>))
+  if (!all_of(VL, IsaPred<Instruction>) || !VectorizeCopyable)
     return InstructionsState::invalid();
-  Instruction *MainOp = dyn_cast<Instruction>(VL[0]);
+  Instruction *MainOp = cast<Instruction>(VL[0]);
   Instruction *AltOp = nullptr;
   unsigned Opcode = MainOp->getOpcode();
   unsigned AltOpcode = Opcode;
-  if (MainOp && VectorizeCopyable && all_of(VL, IsaPred<Instruction>)) {
+  auto getAltOp = [Opcode, &AltOpcode, &AltOp](ArrayRef<Value *> VL) {
     for (Value *V : VL) {
       Instruction *I = cast<Instruction>(V);
       if (I->isIntDivRem() || I->isFPDivRem())
-        return InstructionsState::invalid();
+        return false;
       if (isa<PHINode>(I)) {
-        AltOp = nullptr;
-        break;
+        return false;
       }
       unsigned VOpcode = I->getOpcode();
       if (VOpcode != Opcode) {
@@ -1169,34 +1168,35 @@ static InstructionsState getCopyableOpcode(ArrayRef<Value *> VL,
           AltOpcode = VOpcode;
           AltOp = I;
         }
-        if (VOpcode != AltOpcode) {
-          AltOp = nullptr;
-          break;
-        }
+        if (VOpcode != AltOpcode)
+          return false;
       }
     }
-    if (AltOp) {
-      bool IsBinOp = isa<BinaryOperator>(MainOp);
-      bool IsAltBinOp = isa<BinaryOperator>(AltOp);
-      if (!IsBinOp && IsAltBinOp) {
-        std::swap(MainOp, AltOp);
-        std::swap(IsBinOp, IsAltBinOp);
-        std::swap(Opcode, AltOpcode);
-      }
-      if ((IsBinOp || IsAltBinOp) && !(IsBinOp && IsAltBinOp) &&
-          isCopyableOp(VL, MainOp, AltOp)) {
-        SmallVector<Value *, 8> MainOps, AltOps;
-        for (Value *V : VL) {
-          Instruction *I = cast<Instruction>(V);
-          if (I->getOpcode() == Opcode)
-            MainOps.push_back(I);
-          else
-            AltOps.push_back(I);
-        }
-        if (getSameOpcode(MainOps, TLI) && getSameOpcode(AltOps, TLI))
-          return InstructionsState(MainOp, AltOp);
-      }
+    if (AltOp)
+      return true;
+    return false;
+  };
+  if (!getAltOp(VL))
+    return InstructionsState::invalid();
+  bool IsBinOp = isa<BinaryOperator>(MainOp);
+  bool IsAltBinOp = isa<BinaryOperator>(AltOp);
+  if (!IsBinOp && IsAltBinOp) {
+    std::swap(MainOp, AltOp);
+    std::swap(IsBinOp, IsAltBinOp);
+    std::swap(Opcode, AltOpcode);
+  }
+  if ((IsBinOp || IsAltBinOp) && !(IsBinOp && IsAltBinOp) &&
+      isCopyableOp(VL, MainOp, AltOp)) {
+    SmallVector<Value *, 8> MainOps, AltOps;
+    for (Value *V : VL) {
+      Instruction *I = cast<Instruction>(V);
+      if (I->getOpcode() == Opcode)
+        MainOps.push_back(I);
+      else
+        AltOps.push_back(I);
     }
+    if (getSameOpcode(MainOps, TLI) && getSameOpcode(AltOps, TLI))
+      return InstructionsState(MainOp, AltOp);
   }
   return InstructionsState::invalid();
 }
@@ -1246,8 +1246,7 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
 
 // Determine that the vector could be vectorized with copyable elements.
 static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
-  if (Main == Alt ||
-      !isa<BinaryOperator>(Main) || !isa<Instruction>(Alt) ||
+  if (Main == Alt || !isa<BinaryOperator>(Main) || !isa<Instruction>(Alt) ||
       any_of(VL, IsaPred<PoisonValue, PHINode>))
     return false;
 
@@ -1611,7 +1610,6 @@ class BoUpSLP {
     ScalarToTreeEntries.clear();
     MustGather.clear();
     NonScheduledFirst.clear();
-    CopyableAltOp.clear();
     EntryToLastInstruction.clear();
     LoadEntriesToVectorize.clear();
     IsGraphTransformMode = false;
@@ -1653,6 +1651,14 @@ class BoUpSLP {
     });
   }
 
+  TreeEntry *isCopiedValue(const Value *Op) const {
+    for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+      if (TE->isCopyOp(Op))
+        return TE.get();
+    }
+    return nullptr;
+  }
+
   /// Checks if the specified gather tree entry \p TE can be represented as a
   /// shuffled vector entry + (possibly) permutation with other gathers. It
   /// implements the checks only for possibly ordered scalars (Loads,
@@ -2580,7 +2586,8 @@ class BoUpSLP {
     }
 
     /// Go through the instructions in VL and append their operands.
-    void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
+    void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S,
+                            bool IsAltOpCopy) {
       assert(!VL.empty() && "Bad VL");
       assert((empty() || VL.size() == getNumLanes()) &&
              "Expected same number of lanes");
@@ -2627,12 +2634,16 @@ class BoUpSLP {
           }
           bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
-          auto *Inst = cast<Instruction>(VL[Lane]);
-          if (Inst->getOpcode() != MainOp->getOpcode() &&
-              OpIdx > (Inst->getNumOperands() - 1)) {
-            OpsVec[OpIdx][Lane] = {
-                PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
-                false};
+          if (IsAltOpCopy &&
+              MainOp->getOpcode() != cast<Instruction>(VL[Lane])->getOpcode()) {
+            assert(isa<BinaryOperator>(MainOp) && "Wrong operation");
+            if (OpIdx == 0)
+              OpsVec[0][Lane] = {VL[Lane], APO, false};
+            if (OpIdx == 1)
+              OpsVec[1][Lane] = {
+                  ConstantExpr::getBinOpIdentity(MainOp->getOpcode(),
+                                                 MainOp->getType(), true),
+                  APO, false};
           } else {
             OpsVec[OpIdx][Lane] = {
                 cast<Instruction>(VL[Lane])->getOperand(OpIdx), APO, false};
@@ -2743,11 +2754,11 @@ class BoUpSLP {
   public:
     /// Initialize with all the operands of the instruction vector \p RootVL.
     VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
-               const BoUpSLP &R)
+               const BoUpSLP &R, bool IsAltOpCopy = false)
         : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
           L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
       // Append all the operands of RootVL.
-      appendOperandsOfVL(RootVL, S);
+      appendOperandsOfVL(RootVL, S, IsAltOpCopy);
     }
 
     /// \Returns a value vector with the operands across all lanes for the
@@ -3566,7 +3577,7 @@ class BoUpSLP {
     }
 
     /// Some of the instructions in the list have alternate opcodes.
-    bool isAltShuffle() const { return S.isAltShuffle() && !IsAltOpCopy; }
+    bool isAltShuffle() const { return S.isAltShuffle(); }
 
     bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
 
@@ -3598,6 +3609,14 @@ class BoUpSLP {
 
     void setAltOpCopy(bool Val) { IsAltOpCopy = Val; }
 
+    bool isCopyOp(const Value *Op) const {
+      auto *I = dyn_cast<Instruction>(Op);
+      if (I && IsAltOpCopy && I->getOpcode() == S.getAltOpcode() &&
+          find(Scalars, Op) != Scalars.end())
+        return true;
+      return false;
+    }
+
     bool hasState() const { return S.valid(); }
 
     /// When ReuseReorderShuffleIndices is empty it just returns position of \p
@@ -3822,16 +3841,13 @@ class BoUpSLP {
           continue;
         auto It = ScalarToTreeEntries.find(V);
         Instruction *I = dyn_cast<Instruction>(V);
+        if (IsAltOpCopy && I->getOpcode() != Opcode)
+          continue;
         assert(
             (It == ScalarToTreeEntries.end() ||
              (It->getSecond().size() == 1 && It->getSecond().front() == Last) ||
              doesNotNeedToBeScheduled(V)) &&
             "Scalar already in tree!");
-        bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
-        if (IsAltOpCopy && IsAltInst) {
-          CopyableAltOp[V] = Last;
-          continue;
-        }
         if (It == ScalarToTreeEntries.end()) {
           ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
           (void)Processed.insert(V);
@@ -3941,9 +3957,6 @@ class BoUpSLP {
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
-  /// Maps a scalar copies to the its tree entry(ies).
-  SmallDenseMap<Value *, TreeEntry *> CopyableAltOp;
-
   /// A set of first non-schedulable values.
   ValueSet NonScheduledFirst;
 
@@ -4405,17 +4418,16 @@ class BoUpSLP {
               DecrUnsched(I);
           // Handle a copy instruction dependencies.
           if (TE && TE->isAltOpCopy() && BundleMember->IsCopy) {
-            doForAllOpcodes(BundleMember->Inst, [BundleMember, &ReadyList](
-                                                    ScheduleData *CopyUse) {
-              if (BundleMember != CopyUse && CopyUse->hasValidDependencies() &&
+            doForAllOpcodes(BundleMember->Inst, [BundleMember, &ReadyList](ScheduleData *CopyUse) {
+              if (BundleMember != CopyUse &&
+                  CopyUse->hasValidDependencies() &&
                   CopyUse->incrementUnscheduledDeps(-1) == 0) {
                 ScheduleData *DepBundle = CopyUse->FirstInBundle;
                 assert(!DepBundle->IsScheduled &&
-                       "already scheduled bundle gets ready");
+                   "already scheduled bundle gets ready");
                 if (DepBundle->isReady()) {
                   ReadyList.insert(DepBundle);
-                  LLVM_DEBUG(dbgs() << "SLP:    gets ready (copyable): "
-                                    << *DepBundle << "\n");
+                  LLVM_DEBUG(dbgs() << "SLP:    gets ready (copyable): " << *DepBundle << "\n");
                 }
               }
             });
@@ -7925,6 +7937,7 @@ bool BoUpSLP::canRepresentAsCopyable(const InstructionsState &S,
   DenseMap<unsigned, unsigned> AltOps;
   SmallVector<unsigned> MainAltOps;
   unsigned Operand;
+  Instruction *NewAlt = nullptr;
 
   if (isCopyableOp(VL, S.getMainOp(), S.getAltOp()))
     return true;
@@ -7950,6 +7963,14 @@ bool BoUpSLP::canRepresentAsCopyable(const InstructionsState &S,
           if (!AltOps.size())
             Operand = Op;
           AltOps[I] = Op;
+        } else {
+          if (!NewAlt) {
+            NewAlt = Inst1;
+            if (!tryToRepresentAsInstArg(S.getAltOpcode(), Inst1))
+              return false;
+          }
+          if (NewAlt->getOpcode() != Inst1->getOpcode())
+            return false;
         }
       }
     } else if (Inst->getOpcode() == Opcode1) {
@@ -8952,7 +8973,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       Instruction *I = dyn_cast<Instruction>(V);
       if (!I)
         continue;
-      if (I->getOpcode() == S.getAltOpcode() && CopyableAltOp.contains(V)) {
+      if (I->getOpcode() == S.getAltOpcode() && isCopiedValue(V)) {
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndices);
         return;
@@ -9389,72 +9410,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
             TE->dump());
       }
-      if (IsAltOpCopy && !isCopyableOp(VL, S.getMainOp(), S.getAltOp())) {
+      if (IsAltOpCopy) {
         ValueList Left, Right;
-        unsigned Opcode0 = S.getOpcode();
-        unsigned Opcode1 = S.getAltOpcode();
-        unsigned Operand;
-        bool IsOperandSet = false;
-        ValueList newMainVL;
-        ValueList newVL;
-        for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
-          Instruction *Inst = cast<Instruction>(VL[I]);
-          if (Inst->getOpcode() == Opcode0) {
-            newMainVL.push_back(VL[I]);
-            unsigned Op = 0;
-            Instruction *Inst1 = dyn_cast<Instruction>(Inst->getOperand(Op));
-            if (!Inst1) {
-              newVL.push_back(Inst->getOperand(Op));
-              continue;
-            }
-            if (IsOperandSet && Op != Operand)
-              return;
-            if (Inst1->getOpcode() == Opcode1) {
-              if (!IsOperandSet) {
-                Operand = Op;
-                IsOperandSet = true;
-              }
-            }
-            newVL.push_back(Inst1);
-          } else if (Inst->getOpcode() == Opcode1) {
-            newVL.push_back(Inst);
-          }
-        }
-        VLOperands Ops(VL, S, *this);
+        VLOperands Ops(VL, S, *this, true /*IsAltOpCopy*/);
         Left = Ops.getVL(0);
         Right = Ops.getVL(1);
-        for (unsigned I = 0, VF = VL.size(); I < VF; ++I)
-          if ((cast<Instruction>(VL[I]))->getOpcode() != Opcode0) {
-            Right[I] = ConstantExpr::getBinOpIdentity(
-                Opcode0, Right[0]->getType(), true);
-          }
-        TE->setOperand(0, newVL);
+        TE->setOperand(0, Left);
         TE->setOperand(1, Right);
-        buildTree_rec(newVL, Depth + 1, {TE, 0});
+        buildTree_rec(Left, Depth + 1, {TE, 0});
         buildTree_rec(Right, Depth + 1, {TE, 1});
         return;
-      } else if (IsAltOpCopy) {
-        ValueList Left, Right;
-        unsigned Opcode0 = S.getOpcode();
-        VLOperands Ops(VL, S, *this);
-        Left = Ops.getVL(0);
-        Right = Ops.getVL(1);
-        ValueList Left_new, Right_new;
-        for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
-          if ((cast<Instruction>(VL[I]))->getOpcode() != Opcode0) {
-            Left_new.push_back(VL[I]);
-            Right_new.push_back(ConstantExpr::getBinOpIdentity(
-                Opcode0, S.getMainOp()->getType(), true));
-          } else {
-            Left_new.push_back(Left[I]);
-            Right_new.push_back(Right[I]);
-          }
-        }
-        TE->setOperand(0, Left_new);
-        TE->setOperand(1, Right_new);
-        buildTree_rec(Left_new, Depth + 1, {TE, 0});
-        buildTree_rec(Right_new, Depth + 1, {TE, 1});
-        return;
       }
       // Reorder operands if reordering would enable vectorization.
       auto *CI = dyn_cast<CmpInst>(VL0);
@@ -11721,9 +11686,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
            E->getMainOp()->getType()->isPointerTy())) &&
          "Invalid VL");
   Instruction *VL0 = E->getMainOp();
-  unsigned ShuffleOrOp = (E->isAltShuffle() && !E->isAltOpCopy())
-                             ? (unsigned)Instruction::ShuffleVector
-                             : E->getOpcode();
+  unsigned ShuffleOrOp =
+      (E->isAltShuffle() && !E->isAltOpCopy())
+          ? (unsigned)Instruction::ShuffleVector
+          : E->getOpcode();
   if (E->CombinedOp != TreeEntry::NotCombinedOp)
     ShuffleOrOp = E->CombinedOp;
   SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
@@ -14513,12 +14479,10 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
       V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
     auto *Bundle = BlocksSchedules[BB]->getScheduleData(V, E);
     if (Bundle && Bundle->isPartOfBundle()) {
-      if (any_of(E->Scalars, [&](Value *V) {
-            return (!doesNotNeedToBeScheduled(V) && CopyableAltOp.contains(V));
-          }))
+      if (E->isAltOpCopy())
         Bundle = Bundle->FirstInBundle;
       for (; Bundle; Bundle = Bundle->NextInBundle)
-        if (!CopyableAltOp.contains(Bundle->Inst) &&
+        if ((!Bundle->IsCopy && !Bundle->CopyInst) &&
             !doesNotNeedToBeScheduled(Bundle->Inst))
           Res = Bundle->Inst;
     }
@@ -15934,8 +15898,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   };
 
   assert(!E->isGather() && "Unhandled state");
-  unsigned ShuffleOrOp =
-      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+  unsigned ShuffleOrOp = (E->isAltShuffle() && !E->isAltOpCopy())
+                             ? (unsigned)Instruction::ShuffleVector
+                             : E->getOpcode();
   Instruction *VL0 = E->getMainOp();
   auto GetOperandSignedness = [&](unsigned Idx) {
     const TreeEntry *OpE = getOperandEntry(E, Idx);
@@ -16985,7 +16950,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
     if (User && !is_contained(Scalar->users(), User))
       continue;
     const TreeEntry *E = &ExternalUse.E;
-    if (!E && CopyableAltOp.contains(Scalar))
+    if (!E && isCopiedValue(Scalar))
       continue;
     assert(E && "Invalid scalar");
     assert(!E->isGather() && "Extracting from a gather list");
@@ -17690,8 +17655,7 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     if (IsAltOpCopy && IsAltInst)
       BundleMember->IsCopy = true;
     PrevInBundle = BundleMember;
-    if (SLP->CopyableAltOp.contains(I)) {
-      TreeEntry *TE = SLP->CopyableAltOp[I];
+    if (TreeEntry *TE = SLP->isCopiedValue(I)) {
       assert(TE && "Incorrect state");
       ScheduleData *SD = getScheduleData(I, TE);
       assert(SD && SD->IsCopy && "ScheduleData incorrect state");
@@ -18285,10 +18249,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
       SmallVector<Instruction *, 2> InstrSched;
       for (ScheduleData *BundleMember = SD; BundleMember;
            BundleMember = BundleMember->NextInBundle) {
-        if (CopyableAltOp.contains(BundleMember->Inst))
-          Insts.insert(Insts.begin(), BundleMember->Inst);
-        else
-          Insts.push_back(BundleMember->Inst);
+        Insts.push_back(BundleMember->Inst);
       }
     }
     return Insts;

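One design note on this revision, as I read the diff: dropping the CopyableAltOp side map means isCopiedValue now answers each query with a linear walk over VectorizableTree, trading O(1) lookups for not having to keep a map coherent through deleteTree and buildBundle. The shape of the lookup as a simplified standalone sketch (the template parameters are stand-ins for the BoUpSLP internals; only the query pattern is meant to match):

#include <memory>
#include <vector>

// Returns the first entry that claims Op as a copyable alternate, or
// nullptr. EntryT only needs an isCopyOp(const ValueT *) predicate,
// which TreeEntry gains in this revision.
template <typename EntryT, typename ValueT>
static EntryT *findCopyEntry(const std::vector<std::unique_ptr<EntryT>> &Tree,
                             const ValueT *Op) {
  for (const std::unique_ptr<EntryT> &TE : Tree)
    if (TE->isCopyOp(Op))
      return TE.get();
  return nullptr;
}
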
>From de9de630e0d22811a72b76cd4cd0e721f7191e42 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at yahoo.com>
Date: Wed, 19 Mar 2025 21:09:41 +0000
Subject: [PATCH 8/9] Update llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Co-authored-by: Alexey Bataev <a.bataev at gmx.com>
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d61d2f7ea7d42..680865b1f2c01 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1250,8 +1250,8 @@ static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
       any_of(VL, IsaPred<PoisonValue, PHINode>))
     return false;
 
-  Instruction *MainOp = cast<Instruction>(Main);
-  Instruction *AltOp = cast<Instruction>(Alt);
+  auto *MainOp = cast<Instruction>(Main);
+  auto *AltOp = cast<Instruction>(Alt);
 
   if (isa<BinaryOperator>(MainOp) && !isa<BinaryOperator>(AltOp) &&
       isValidForAlternation(MainOp->getOpcode()) &&

>From 4d604f68c7cac90b6f356b4cc402a369ce3cd048 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at yahoo.com>
Date: Wed, 19 Mar 2025 21:10:27 +0000
Subject: [PATCH 9/9] Update llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Co-authored-by: Alexey Bataev <a.bataev at gmx.com>
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 680865b1f2c01..e91071c4a223e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1244,7 +1244,7 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
   }
 }
 
-// Determine that the vector could be vectorized with copyable elements.
+/// Determine whether the vector could be vectorized with copyable elements.
 static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
   if (Main == Alt || !isa<BinaryOperator>(Main) || !isa<Instruction>(Alt) ||
       any_of(VL, IsaPred<PoisonValue, PHINode>))


