[llvm] [WIP][SLP] SLP's copyable elements based upon Main/Alt operations. (PR #124242)

Dinar Temirbulatov via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 31 04:07:51 PST 2025


https://github.com/dtemirbulatov updated https://github.com/llvm/llvm-project/pull/124242

>From 6b92c26d790e0172be9df8f2f034e3f2d7cd0a8e Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at gmail.com>
Date: Sat, 18 Jan 2025 21:01:52 +0000
Subject: [PATCH 1/2] [WIP][SLP] SLP's copyable elements based upon Main/Alt
 operations.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 763 +++++++++++++++---
 .../X86/vect_copyable_in_binops.ll            | 534 +++++++++---
 2 files changed, 1084 insertions(+), 213 deletions(-)
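
For context, a "copyable" element here is a scalar lane whose instruction
does not match the bundle's main (binary) opcode but can still be modeled
as the main operation applied to the lane's value and the operation's
identity constant (see the ConstantExpr::getBinOpIdentity calls in the
patch). A minimal hypothetical sketch in LLVM IR (names invented for
illustration, not taken from the test file):

  %a0   = load i32, ptr %src
  %a1   = load i32, ptr %src.1
  %add0 = add i32 %a0, 1          ; lane 0: matches the main opcode
  store i32 %add0, ptr %dst
  store i32 %a1, ptr %dst.1       ; lane 1: copyable, modeled as add %a1, 0

With -slp-vectorize-copyable the bundle {%add0, %a1} could then be
vectorized as a single <2 x i32> add whose second operand is padded with
the identity constant, i.e. <i32 1, i32 0>.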

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c98d872fb6467f..47b61496b5e155 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -201,6 +201,10 @@ static cl::opt<bool> VectorizeNonPowerOf2(
     "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
     cl::desc("Try to vectorize with non-power-of-2 number of elements."));
 
+static cl::opt<bool>
+    VectorizeCopyable("slp-vectorize-copyable", cl::init(false), cl::Hidden,
+                      cl::desc("Try to vectorize with copyable elements."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -426,6 +430,8 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
   if (isa<ExtractElementInst>(I))
     return isConstant(I->getOperand(1));
   assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
+  if (I->getNumOperands() < 2)
+    return false;
   return isConstant(I->getOperand(2));
 }
 
@@ -594,6 +600,41 @@ static std::optional<unsigned> getElementIndex(const Value *Inst,
   return Index;
 }
 
+/// Checks whether an instruction with \p Opcode can be considered as an
+/// operand of the (possibly binary) operation \p I.
+/// \returns the opcode of \p I if an instruction with \p Opcode can be
+/// represented as an operand of \p I with a default (identity) value,
+/// 0 otherwise.
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+  if (Opcode != Instruction::PHI && Opcode != Instruction::Invoke &&
+      !isa<FPMathOperator>(I) &&
+      ((I->getType()->isIntegerTy() &&
+        (I->getOpcode() == Instruction::Add ||
+         I->getOpcode() == Instruction::And ||
+         I->getOpcode() == Instruction::AShr ||
+         I->getOpcode() == Instruction::BitCast ||
+         I->getOpcode() == Instruction::Call ||
+         // Disabled due to a scheduling issue with
+         // isVectorLikeInstWithConstOps operations.
+         // I->getOpcode() == Instruction::ExtractElement ||
+         // I->getOpcode() == Instruction::ExtractValue ||
+         I->getOpcode() == Instruction::ICmp ||
+         I->getOpcode() == Instruction::Load ||
+         I->getOpcode() == Instruction::LShr ||
+         I->getOpcode() == Instruction::Mul ||
+         I->getOpcode() == Instruction::Or ||
+         I->getOpcode() == Instruction::PtrToInt ||
+         I->getOpcode() == Instruction::Select ||
+         I->getOpcode() == Instruction::SExt ||
+         I->getOpcode() == Instruction::Shl ||
+         I->getOpcode() == Instruction::Sub ||
+         I->getOpcode() == Instruction::Trunc ||
+         I->getOpcode() == Instruction::Xor ||
+         I->getOpcode() == Instruction::ZExt))))
+    return I->getOpcode();
+  return 0;
+}
+
 namespace {
 /// Specifies the way the mask should be analyzed for undefs/poisonous elements
 /// in the shuffle mask.
@@ -853,6 +894,16 @@ class InstructionsState {
 
 } // end anonymous namespace
 
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as the main operation of \p S, the key is \p Op.
+/// Otherwise the key is the main operation of \p S.
+static Value *isOneOf(const InstructionsState &S, Value *Op) {
+  auto *I = dyn_cast<Instruction>(Op);
+  if (I && S.isOpcodeOrAlt(I))
+    return Op;
+  return S.getMainOp();
+}
+
 /// \returns true if \p Opcode is allowed as part of the main/alternate
 /// instruction for SLP vectorization.
 ///
@@ -865,6 +916,14 @@ static bool isValidForAlternation(unsigned Opcode) {
   return true;
 }
 
+// Check for inner dependencies; we cannot support such dependencies if they
+// come from the main operation, only from an alternative one.
+static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
+                                  const InstructionsState &S);
+
+// Determine whether the vector could be vectorized with copyable elements.
+static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt);
+
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                        const TargetLibraryInfo &TLI);
 
@@ -917,19 +976,53 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     return InstructionsState::invalid();
 
   Instruction *MainOp = cast<Instruction>(*It);
+  Instruction *AltOp = MainOp;
+  unsigned Opcode = MainOp->getOpcode();
+  unsigned AltOpcode = Opcode;
+  for (Value *V : iterator_range(It + 1, VL.end())) {
+    Instruction *Inst = dyn_cast<Instruction>(V);
+    if (!Inst)
+      continue;
+    unsigned VOpcode = Inst->getOpcode();
+    if (Inst && AltOpcode == Opcode && !isa<PHINode>(Inst) &&
+        VOpcode != Opcode && isValidForAlternation(VOpcode)) {
+      AltOpcode = VOpcode;
+      AltOp = Inst;
+      break;
+    }
+  }
   unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
   if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
       (VL.size() == 2 && InstCnt < 2))
     return InstructionsState::invalid();
+  bool IsBinOp = isa<BinaryOperator>(MainOp);
+  bool IsCopyable = false;
 
+  if (MainOp && AltOp && MainOp != AltOp) {
+    if (!IsBinOp && isa<BinaryOperator>(AltOp) && !isa<PHINode>(MainOp)) {
+      std::swap(MainOp, AltOp);
+      std::swap(AltOpcode, Opcode);
+      IsBinOp = true;
+    }
+    IsCopyable = VectorizeCopyable && isCopyableOp(VL, MainOp, AltOp);
+    if (IsCopyable && isa<CmpInst>(AltOp)) {
+      Type *Ty0 = MainOp->getOperand(0)->getType();
+      Type *Ty1 = AltOp->getOperand(0)->getType();
+      if (Ty0 != Ty1)
+        return InstructionsState::invalid();
+    }
+    if (!IsCopyable) {
+      MainOp = cast<Instruction>(*It);
+      AltOp = MainOp;
+      Opcode = MainOp->getOpcode();
+      AltOpcode = Opcode;
+      IsBinOp = isa<BinaryOperator>(MainOp);
+    }
+  }
   bool IsCastOp = isa<CastInst>(MainOp);
-  bool IsBinOp = isa<BinaryOperator>(MainOp);
   bool IsCmpOp = isa<CmpInst>(MainOp);
   CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                         : CmpInst::BAD_ICMP_PREDICATE;
-  Instruction *AltOp = MainOp;
-  unsigned Opcode = MainOp->getOpcode();
-  unsigned AltOpcode = Opcode;
 
   bool SwappedPredsCompatible = IsCmpOp && [&]() {
     SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
@@ -984,7 +1077,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
         AltOp = I;
         continue;
       }
-    } else if (IsCastOp && isa<CastInst>(I)) {
+    } else if ((IsCastOp || IsCopyable) && isa<CastInst>(I)) {
       Value *Op0 = MainOp->getOperand(0);
       Type *Ty0 = Op0->getType();
       Value *Op1 = I->getOperand(0);
@@ -1001,13 +1094,15 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
           continue;
         }
       }
-    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
+    } else if (auto *Inst = dyn_cast<CmpInst>(I);
+               Inst && (IsCmpOp || IsCopyable)) {
       auto *BaseInst = cast<CmpInst>(MainOp);
       Type *Ty0 = BaseInst->getOperand(0)->getType();
       Type *Ty1 = Inst->getOperand(0)->getType();
       if (Ty0 == Ty1) {
-        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
-        assert(InstOpcode == AltOpcode &&
+        assert((IsCopyable || InstOpcode == Opcode) &&
+               "Expected same CmpInst opcode.");
+        assert((IsCopyable || InstOpcode == AltOpcode) &&
                "Alternate instructions are only supported by BinaryOperator "
                "and CastInst.");
         // Check for compatible operands. If the corresponding operands are not
@@ -1038,23 +1133,32 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
             AltPred == CurrentPred || AltPred == SwappedCurrentPred)
           continue;
       }
-    } else if (InstOpcode == Opcode) {
-      assert(InstOpcode == AltOpcode &&
+    } else if (InstOpcode == Opcode ||
+               (IsCopyable && InstOpcode == AltOpcode)) {
+      assert((IsCopyable || InstOpcode == AltOpcode) &&
              "Alternate instructions are only supported by BinaryOperator and "
              "CastInst.");
+      Instruction *Op = MainOp;
+      if (IsCopyable) {
+        if (InstOpcode != Opcode && InstOpcode != AltOpcode) {
+          Op = I;
+        } else if (Opcode != AltOpcode && InstOpcode == AltOpcode) {
+          Op = AltOp;
+        }
+      }
       if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
         if (Gep->getNumOperands() != 2 ||
-            Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
+            Gep->getOperand(0)->getType() != Op->getOperand(0)->getType())
           return InstructionsState::invalid();
       } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
         if (!isVectorLikeInstWithConstOps(EI))
           return InstructionsState::invalid();
       } else if (auto *LI = dyn_cast<LoadInst>(I)) {
-        auto *BaseLI = cast<LoadInst>(MainOp);
+        auto *BaseLI = cast<LoadInst>(Op);
         if (!LI->isSimple() || !BaseLI->isSimple())
           return InstructionsState::invalid();
       } else if (auto *Call = dyn_cast<CallInst>(I)) {
-        auto *CallBase = cast<CallInst>(MainOp);
+        auto *CallBase = cast<CallInst>(Op);
         if (Call->getCalledFunction() != CallBase->getCalledFunction())
           return InstructionsState::invalid();
         if (Call->hasOperandBundles() &&
@@ -1069,13 +1173,14 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
           return InstructionsState::invalid();
         if (!ID) {
           SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
-          if (Mappings.size() != BaseMappings.size() ||
-              Mappings.front().ISA != BaseMappings.front().ISA ||
-              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
-              Mappings.front().VectorName != BaseMappings.front().VectorName ||
-              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
-              Mappings.front().Shape.Parameters !=
-                  BaseMappings.front().Shape.Parameters)
+          if (Mappings.size() &&
+              (Mappings.size() != BaseMappings.size() ||
+               Mappings.front().ISA != BaseMappings.front().ISA ||
+               Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
+               Mappings.front().VectorName != BaseMappings.front().VectorName ||
+               Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
+               Mappings.front().Shape.Parameters !=
+                   BaseMappings.front().Shape.Parameters))
             return InstructionsState::invalid();
         }
       }
@@ -1124,6 +1229,46 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
   }
 }
 
+static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
+                                  const InstructionsState &S) {
+  SmallSet<Value *, 4> Ops;
+  unsigned Opcode = S.getOpcode();
+  for (Value *V : VL) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      continue;
+    if (I->getOpcode() == Opcode)
+      Ops.insert(V);
+  }
+  for (Value *V : VL) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      continue;
+    for (Use &U : I->operands()) {
+      if (auto *Op = dyn_cast<Instruction>(U.get()); Op && Ops.contains(Op))
+        return false;
+    }
+  }
+  return true;
+}
+
+static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
+  if (any_of(VL, IsaPred<PoisonValue>) || Main == Alt ||
+      !isa<BinaryOperator>(Main) || !isa<Instruction>(Alt) ||
+      find_if(VL, IsaPred<PHINode>) != VL.end())
+    return false;
+
+  Instruction *MainOp = cast<Instruction>(Main);
+  Instruction *AltOp = cast<Instruction>(Alt);
+
+  if (isa<BinaryOperator>(MainOp) && !isa<BinaryOperator>(AltOp) &&
+      isValidForAlternation(MainOp->getOpcode()) &&
+      isValidForAlternation(AltOp->getOpcode()) &&
+      tryToRepresentAsInstArg(MainOp->getOpcode(), AltOp) &&
+      tryToRepresentAsInstArg(AltOp->getOpcode(), MainOp))
+    return true;
+  return false;
+}
 /// \returns the AA location that is being access by the instruction.
 static MemoryLocation getLocation(Instruction *I) {
   if (StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -1463,6 +1608,7 @@ class BoUpSLP {
     MultiNodeScalars.clear();
     MustGather.clear();
     NonScheduledFirst.clear();
+    CopyableAltOp.clear();
     EntryToLastInstruction.clear();
     LoadEntriesToVectorize.clear();
     IsGraphTransformMode = false;
@@ -2461,8 +2607,16 @@ class BoUpSLP {
           }
           bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
-          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
-                                 APO, false};
+          Instruction *Inst = cast<Instruction>(VL[Lane]);
+          if (Inst->getOpcode() != MainOp->getOpcode() &&
+              OpIdx > (Inst->getNumOperands() - 1)) {
+            OpsVec[OpIdx][Lane] = {
+                PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
+                false};
+          } else {
+            OpsVec[OpIdx][Lane] = {
+                cast<Instruction>(VL[Lane])->getOperand(OpIdx), APO, false};
+          }
         }
       }
     }
@@ -3298,6 +3452,7 @@ class BoUpSLP {
                          ///< complex node like select/cmp to minmax, mul/add to
                          ///< fma, etc. Must be used for the following nodes in
                          ///< the pattern, not the very first one.
+      CopyableVectorize, ///< The node for copyable elements.
     };
     EntryState State;
 
@@ -3357,7 +3512,8 @@ class BoUpSLP {
       if (Operands.size() < OpIdx + 1)
         Operands.resize(OpIdx + 1);
       assert(Operands[OpIdx].empty() && "Already resized?");
-      assert(OpVL.size() <= Scalars.size() &&
+      assert((State == TreeEntry::CopyableVectorize ||
+              OpVL.size() <= Scalars.size()) &&
              "Number of operands is greater than the number of scalars.");
       Operands[OpIdx].resize(OpVL.size());
       copy(OpVL, Operands[OpIdx].begin());
@@ -3401,7 +3557,9 @@ class BoUpSLP {
     }
 
     /// Some of the instructions in the list have alternate opcodes.
-    bool isAltShuffle() const { return S.isAltShuffle(); }
+    bool isAltShuffle() const {
+      return S.isAltShuffle() && State != TreeEntry::CopyableVectorize;
+    }
 
     bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
 
@@ -3524,6 +3682,9 @@ class BoUpSLP {
       case CombinedVectorize:
         dbgs() << "CombinedVectorize\n";
         break;
+      case CopyableVectorize:
+        dbgs() << "CopyableVectorize\n";
+        break;
       }
       if (S) {
         dbgs() << "MainOp: " << *S.getMainOp() << "\n";
@@ -3619,6 +3780,7 @@ class BoUpSLP {
     // for non-power-of-two vectors.
     assert(
         (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
+         EntryState == TreeEntry::CopyableVectorize ||
          ReuseShuffleIndices.empty()) &&
         "Reshuffling scalars not yet supported for nodes with padding");
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
@@ -3642,8 +3804,13 @@ class BoUpSLP {
       Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
     }
     if (!Last->isGather()) {
-      for (Value *V : VL) {
+      unsigned Opcode = S.getOpcode();
+      for (unsigned i = 0; i < VL.size(); ++i) {
+        Value *V = VL[i];
         const TreeEntry *TE = getTreeEntry(V);
+        Instruction *I = dyn_cast<Instruction>(V);
+        bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
+
         assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
                "Scalar already in tree!");
         if (TE) {
@@ -3651,6 +3818,10 @@ class BoUpSLP {
             MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
           continue;
         }
+        if (EntryState == TreeEntry::CopyableVectorize && IsAltInst) {
+          CopyableAltOp.insert(V);
+          continue;
+        }
         ScalarToTreeEntry[V] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
@@ -3725,6 +3896,10 @@ class BoUpSLP {
   bool areAltOperandsProfitable(const InstructionsState &S,
                                 ArrayRef<Value *> VL) const;
 
+  /// Check whether we can represent the operations as copyable by looking
+  /// at the operations' operands.
+  bool canRepresentAsCopyable(const InstructionsState &S, ArrayRef<Value *> VL);
+
   /// Checks if the specified list of the instructions/values can be vectorized
   /// and fills required data before actual scheduling of the instructions.
   TreeEntry::EntryState
@@ -3746,6 +3921,9 @@ class BoUpSLP {
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
+  /// A set of scalars that we are considering as copyable operations.
+  ValueSet CopyableAltOp;
+
   /// A set of first non-schedulable values.
   ValueSet NonScheduledFirst;
 
@@ -3875,15 +4053,16 @@ class BoUpSLP {
 
     ScheduleData() = default;
 
-    void init(int BlockSchedulingRegionID, Instruction *I) {
+    void init(int BlockSchedulingRegionID, Value *OpVal) {
       FirstInBundle = this;
       NextInBundle = nullptr;
       NextLoadStore = nullptr;
       IsScheduled = false;
       SchedulingRegionID = BlockSchedulingRegionID;
       clearDependencies();
-      Inst = I;
+      OpValue = OpVal;
       TE = nullptr;
+      IsCopy = false;
     }
 
     /// Verify basic self consistency properties
@@ -3990,6 +4169,9 @@ class BoUpSLP {
 
     Instruction *Inst = nullptr;
 
+    /// Key value used to look up this instruction's extra schedule data.
+    Value *OpValue = nullptr;
+
     /// The TreeEntry that this instruction corresponds to.
     TreeEntry *TE = nullptr;
 
@@ -4037,6 +4219,9 @@ class BoUpSLP {
     /// True if this instruction is scheduled (or considered as scheduled in the
     /// dry-run).
     bool IsScheduled = false;
+
+    /// True if this instruction is a copy.
+    bool IsCopy = false;
   };
 
 #ifndef NDEBUG
@@ -4106,6 +4291,31 @@ class BoUpSLP {
       return nullptr;
     }
 
+    ScheduleData *getScheduleData(Value *V, Value *Key) {
+      auto I = ExtraScheduleDataMap.find(V);
+      if (I != ExtraScheduleDataMap.end()) {
+        ScheduleData *SD = I->second.lookup(Key);
+        if (SD && isInSchedulingRegion(SD))
+          return SD;
+      }
+      if (V == Key)
+        return getScheduleData(V);
+      return nullptr;
+    }
+
+    ScheduleData *getScheduleData(Value *V, const TreeEntry *E) {
+      ScheduleData *SD = getScheduleData(V);
+      if (SD && isInSchedulingRegion(SD) && SD->TE == E)
+        return SD;
+      auto I = ExtraScheduleDataMap.find(V);
+      if (I == ExtraScheduleDataMap.end())
+        return nullptr;
+      for (auto &P : I->second)
+        if (isInSchedulingRegion(P.second) && P.second->TE == E)
+          return P.second;
+      return nullptr;
+    }
+
     bool isInSchedulingRegion(ScheduleData *SD) const {
       return SD->SchedulingRegionID == SchedulingRegionID;
     }
@@ -4119,30 +4329,33 @@ class BoUpSLP {
 
       for (ScheduleData *BundleMember = SD; BundleMember;
            BundleMember = BundleMember->NextInBundle) {
-
         // Handle the def-use chain dependencies.
 
         // Decrement the unscheduled counter and insert to ready list if ready.
-        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
-          ScheduleData *OpDef = getScheduleData(I);
-          if (OpDef && OpDef->hasValidDependencies() &&
-              OpDef->incrementUnscheduledDeps(-1) == 0) {
-            // There are no more unscheduled dependencies after
-            // decrementing, so we can put the dependent instruction
-            // into the ready list.
-            ScheduleData *DepBundle = OpDef->FirstInBundle;
-            assert(!DepBundle->IsScheduled &&
-                   "already scheduled bundle gets ready");
-            ReadyList.insert(DepBundle);
-            LLVM_DEBUG(dbgs()
-                       << "SLP:    gets ready (def): " << *DepBundle << "\n");
-          }
+        auto &&DecrUnsched = [this, &ReadyList, &BundleMember](Instruction *I) {
+          doForAllOpcodes(I, [&ReadyList, &BundleMember,
+                              &I](ScheduleData *OpDef) {
+            if (OpDef && OpDef->hasValidDependencies() &&
+                BundleMember->Inst != I &&
+                OpDef->incrementUnscheduledDeps(-1) == 0) {
+              // There are no more unscheduled dependencies after
+              // decrementing, so we can put the dependent instruction
+              // into the ready list.
+              ScheduleData *DepBundle = OpDef->FirstInBundle;
+              assert(!DepBundle->IsScheduled &&
+                     "already scheduled bundle gets ready");
+              ReadyList.insert(DepBundle);
+              LLVM_DEBUG(dbgs()
+                         << "SLP:    gets ready (def): " << *DepBundle << "\n");
+            }
+          });
         };
 
         // If BundleMember is a vector bundle, its operands may have been
         // reordered during buildTree(). We therefore need to get its operands
         // through the TreeEntry.
-        if (TreeEntry *TE = BundleMember->TE) {
+        if (TreeEntry *TE = BundleMember->TE;
+            TE && TE->State != TreeEntry::CopyableVectorize) {
           // Need to search for the lane since the tree entry can be reordered.
           auto *In = BundleMember->Inst;
           int Lane = std::distance(TE->Scalars.begin(),
@@ -4158,6 +4371,7 @@ class BoUpSLP {
           assert(
               In &&
               (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
+               BundleMember->IsCopy ||
                In->getNumOperands() == TE->getNumOperands()) &&
               "Missed TreeEntry operands?");
 
@@ -4218,7 +4432,8 @@ class BoUpSLP {
                "primary schedule data not in window?");
         assert(isInSchedulingRegion(SD->FirstInBundle) &&
                "entire bundle in window!");
-        SD->verify();
+        (void)SD;
+        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
       }
 
       for (auto *SD : ReadyInsts) {
@@ -4228,35 +4443,49 @@ class BoUpSLP {
       }
     }
 
+    void doForAllOpcodes(Value *V,
+                         function_ref<void(ScheduleData *SD)> Action) {
+      auto I = ExtraScheduleDataMap.find(V);
+      if (I != ExtraScheduleDataMap.end())
+        for (auto &P : I->second)
+          if (isInSchedulingRegion(P.second))
+            Action(P.second);
+      if (ScheduleData *SD = getScheduleData(V))
+        Action(SD);
+    }
+
     /// Put all instructions into the ReadyList which are ready for scheduling.
     template <typename ReadyListType>
     void initialFillReadyList(ReadyListType &ReadyList) {
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
-        ScheduleData *SD = getScheduleData(I);
-        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
-            SD->isReady()) {
-          ReadyList.insert(SD);
-          LLVM_DEBUG(dbgs()
-                     << "SLP:    initially in ready list: " << *SD << "\n");
-        }
+        doForAllOpcodes(I, [&](ScheduleData *SD) {
+          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
+              SD->isReady()) {
+            ReadyList.insert(SD);
+            LLVM_DEBUG(dbgs()
+                       << "SLP:    initially in ready list: " << *SD << "\n");
+          }
+        });
       }
     }
 
     /// Build a bundle from the ScheduleData nodes corresponding to the
     /// scalar instruction for each lane.
-    ScheduleData *buildBundle(ArrayRef<Value *> VL);
+    ScheduleData *buildBundle(ArrayRef<Value *> VL, const InstructionsState &S,
+                              bool IsCopyable, bool &ReSchedule);
 
     /// Checks if a bundle of instructions can be scheduled, i.e. has no
     /// cyclic dependencies. This is only a dry-run, no instructions are
     /// actually moved at this stage.
     /// \returns the scheduling bundle. The returned Optional value is not
     /// std::nullopt if \p VL is allowed to be scheduled.
-    std::optional<ScheduleData *>
-    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
-                      const InstructionsState &S);
+    std::optional<ScheduleData *> tryScheduleBundle(ArrayRef<Value *> VL,
+                                                    BoUpSLP *SLP,
+                                                    const InstructionsState &S,
+                                                    bool IsCopyable);
 
     /// Un-bundles a group of instructions.
-    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
+    void cancelScheduling(ArrayRef<Value *> VL, ScheduleData *Bundle);
 
     /// Allocates schedule data chunk.
     ScheduleData *allocateScheduleDataChunks();
@@ -4296,6 +4525,10 @@ class BoUpSLP {
     /// ScheduleData structures are recycled.
     DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
 
+    /// Maps instructions to extra ScheduleData, keyed by the leading value.
+    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
+        ExtraScheduleDataMap;
+
     /// The ready-list for scheduling (only used for the dry-run).
     SetVector<ScheduleData *> ReadyInsts;
 
@@ -7490,6 +7723,57 @@ static bool isAlternateInstruction(const Instruction *I,
                                    const Instruction *AltOp,
                                    const TargetLibraryInfo &TLI);
 
+bool BoUpSLP::canRepresentAsCopyable(const InstructionsState &S,
+                                     ArrayRef<Value *> VL) {
+  unsigned Opcode0 = S.getOpcode();
+  unsigned Opcode1 = S.getAltOpcode();
+  DenseMap<unsigned, unsigned> AltOps;
+  SmallVector<unsigned> MainAltOps;
+  unsigned Operand;
+
+  if (!checkCopyableInnerDep(VL, S))
+    return false;
+  if (isCopyableOp(VL, S.getMainOp(), S.getAltOp()))
+    return true;
+  if ((isa<BinaryOperator>(S.getMainOp()) && Opcode0 == Opcode1) ||
+      (!isValidForAlternation(Opcode0) || !isValidForAlternation(Opcode1)) ||
+      !tryToRepresentAsInstArg(S.getOpcode(), S.getAltOp()) ||
+      !tryToRepresentAsInstArg(S.getAltOpcode(), S.getMainOp()))
+    return false;
+  for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+    Instruction *Inst = dyn_cast<Instruction>(VL[I]);
+    if (!Inst)
+      return false;
+    if (Inst->getOpcode() == Opcode0) {
+      for (unsigned Op : seq<unsigned>(0, S.getMainOp()->getNumOperands())) {
+        Instruction *Inst1 = dyn_cast<Instruction>(Inst->getOperand(Op));
+        if (!Inst1)
+          continue;
+        if (Inst1->getOpcode() == Opcode0)
+          return false;
+        if (Inst1->isBinaryOp() && !isa<ConstantInt>(Inst1->getOperand(1)))
+          return false;
+        if (AltOps.contains(I) ||
+            (AltOps.size() && Op != Operand && !Inst1->isCommutative()))
+          return false;
+        if (Inst1->getOpcode() == Opcode1) {
+          if (Inst1->isBinaryOp() && !isa<ConstantInt>(Inst1->getOperand(1)))
+            return false;
+          if (!AltOps.size())
+            Operand = Op;
+          AltOps[I] = Op;
+        }
+      }
+    } else if (Inst->getOpcode() == Opcode1) {
+      MainAltOps.push_back(I);
+    }
+  }
+  if (AltOps.size() > 0 && MainAltOps.size() > 0)
+    return true;
+
+  return false;
+}
+
 bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                        ArrayRef<Value *> VL) const {
   unsigned Opcode0 = S.getOpcode();
@@ -7500,6 +7784,8 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                            Opcode0, Opcode1, OpcodeMask))
     return true;
   SmallVector<ValueList> Operands;
+  if (S.getMainOp()->getNumOperands() != S.getAltOp()->getNumOperands())
+    return false;
   for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
     Operands.emplace_back();
     // Prepare the operand vector.
@@ -7947,6 +8233,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     return TreeEntry::Vectorize;
   }
   case Instruction::ShuffleVector: {
+    if (VectorizeCopyable && isCopyableOp(VL, S.getMainOp(), S.getAltOp()) &&
+        checkCopyableInnerDep(VL, S))
+      return TreeEntry::CopyableVectorize;
     if (!S.isAltShuffle()) {
       // REVEC can support non alternate shuffle.
       if (SLPReVec && getShufflevectorNumGroups(VL))
@@ -7964,6 +8253,14 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
       return TreeEntry::NeedToGather;
     }
 
+    if (VectorizeCopyable) {
+      if (canRepresentAsCopyable(S, VL))
+        return TreeEntry::CopyableVectorize;
+
+      if (!tryToRepresentAsInstArg(S.getOpcode(), S.getAltOp()) ||
+          !tryToRepresentAsInstArg(S.getAltOpcode(), S.getMainOp()))
+        return TreeEntry::NeedToGather;
+    }
     return TreeEntry::Vectorize;
   }
   default:
@@ -8258,6 +8555,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       return false;
     if (Depth >= RecursionMaxDepth - 1)
       return true;
+
     // Check if all operands are extracts, part of vector node or can build a
     // regular vectorize node.
     SmallVector<unsigned, 8> InstsCount;
@@ -8278,6 +8576,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
     auto *I1 = cast<Instruction>(VL.front());
     auto *I2 = cast<Instruction>(VL.back());
+    if (I1->getNumOperands() != I2->getNumOperands())
+      return true;
     for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
       Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                              I2->getOperand(Op));
@@ -8418,7 +8718,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   SmallVector<Value *> PointerOps;
   TreeEntry::EntryState State = getScalarsVectorizationState(
       S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
-  if (State == TreeEntry::NeedToGather) {
+  if (State == TreeEntry::NeedToGather ||
+      (State == TreeEntry::CopyableVectorize &&
+       !has_single_bit(UniqueValues.size()))) {
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndices);
     return;
@@ -8429,18 +8731,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     BSRef = std::make_unique<BlockScheduling>(BB);
 
   BlockScheduling &BS = *BSRef;
-
-  std::optional<ScheduleData *> Bundle =
-      BS.tryScheduleBundle(UniqueValues, this, S);
+  std::optional<ScheduleData *> Bundle;
+  Bundle = BS.tryScheduleBundle(UniqueValues, this, S,
+                                State == TreeEntry::CopyableVectorize);
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
 #endif
-  if (!Bundle) {
+  if (!Bundle || (State == TreeEntry::CopyableVectorize && !Bundle.value())) {
     LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
-    assert((!BS.getScheduleData(VL0) ||
-            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
-           "tryScheduleBundle should cancelScheduling on failure");
+    assert(
+        (!BS.getScheduleData(VL0) ||
+         !BS.getScheduleData(VL0)->isPartOfBundle() ||
+         State == TreeEntry::CopyableVectorize ||
+         (BS.getScheduleData(VL0)->TE && BS.getScheduleData(VL0)->TE->State ==
+                                             TreeEntry::CopyableVectorize)) &&
+        "tryScheduleBundle should cancelScheduling on failure");
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndices);
     NonScheduledFirst.insert(VL.front());
@@ -8585,6 +8891,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             TE->dump());
         break;
       case TreeEntry::CombinedVectorize:
+      case TreeEntry::CopyableVectorize:
       case TreeEntry::NeedToGather:
         llvm_unreachable("Unexpected loads state.");
       }
@@ -8829,8 +9136,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       return;
     }
     case Instruction::ShuffleVector: {
-      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   ReuseShuffleIndices);
+      TreeEntry *TE =
+          (State != TreeEntry::CopyableVectorize)
+              ? newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                             ReuseShuffleIndices)
+              : newTreeEntry(VL, TreeEntry::CopyableVectorize, Bundle, S,
+                             UserTreeIdx, ReuseShuffleIndices);
       if (S.isAltShuffle()) {
         LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                    TE->dump());
@@ -8841,6 +9152,79 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             TE->dump());
       }
 
+      if (State == TreeEntry::CopyableVectorize &&
+          !isCopyableOp(VL, S.getMainOp(), S.getAltOp())) {
+        ValueList Left, Right;
+        unsigned Opcode0 = S.getOpcode();
+        unsigned Opcode1 = S.getAltOpcode();
+
+        unsigned Operand;
+        bool IsOperandSet = false;
+        ValueList newMainVL;
+        ValueList newVL;
+
+        for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+          Instruction *Inst = cast<Instruction>(VL[I]);
+          if (Inst->getOpcode() == Opcode0) {
+            newMainVL.push_back(VL[I]);
+            unsigned Op = 0;
+            Instruction *Inst1 = dyn_cast<Instruction>(Inst->getOperand(Op));
+            if (!Inst1) {
+              newVL.push_back(Inst->getOperand(Op));
+              continue;
+            }
+
+            if (IsOperandSet && Op != Operand && !Inst1->isCommutative())
+              return;
+
+            if (Inst1->getOpcode() == Opcode1) {
+              if (!IsOperandSet) {
+                Operand = Op;
+                IsOperandSet = true;
+              }
+            }
+            newVL.push_back(Inst1);
+          } else if (Inst->getOpcode() == Opcode1) {
+            newVL.push_back(Inst);
+          }
+        }
+        VLOperands Ops(VL, S, *this);
+        Left = Ops.getVL(0);
+        Right = Ops.getVL(1);
+        for (unsigned I = 0, VF = VL.size(); I < VF; ++I)
+          if ((cast<Instruction>(VL[I]))->getOpcode() != Opcode0) {
+            Right[I] = ConstantExpr::getBinOpIdentity(
+                Opcode0, Right[0]->getType(), true);
+          }
+
+        TE->setOperand(0, newVL);
+        TE->setOperand(1, Right);
+        buildTree_rec(newVL, Depth + 1, {TE, 0});
+        buildTree_rec(Right, Depth + 1, {TE, 1});
+        return;
+      } else if (State == TreeEntry::CopyableVectorize) {
+        ValueList Left, Right;
+        unsigned Opcode0 = S.getOpcode();
+        VLOperands Ops(VL, S, *this);
+        Left = Ops.getVL(0);
+        Right = Ops.getVL(1);
+        ValueList Left_new, Right_new;
+        for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+          if ((cast<Instruction>(VL[I]))->getOpcode() != Opcode0) {
+            Left_new.push_back(VL[I]);
+            Right_new.push_back(ConstantExpr::getBinOpIdentity(
+                Opcode0, S.getMainOp()->getType(), true));
+          } else {
+            Left_new.push_back(Left[I]);
+            Right_new.push_back(Right[I]);
+          }
+        }
+        TE->setOperand(0, Left_new);
+        TE->setOperand(1, Right_new);
+        buildTree_rec(Left_new, Depth + 1, {TE, 0});
+        buildTree_rec(Right_new, Depth + 1, {TE, 1});
+        return;
+      }
       // Reorder operands if reordering would enable vectorization.
       auto *CI = dyn_cast<CmpInst>(VL0);
       if (CI && any_of(VL, [](Value *V) {
@@ -11147,7 +11531,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize ||
-          E->State == TreeEntry::StridedVectorize) &&
+          E->State == TreeEntry::StridedVectorize ||
+          E->State == TreeEntry::CopyableVectorize) &&
          "Unhandled state");
   assert(E->getOpcode() &&
          ((allSameType(VL) && allSameBlock(VL)) ||
@@ -11156,7 +11541,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
          "Invalid VL");
   Instruction *VL0 = E->getMainOp();
   unsigned ShuffleOrOp =
-      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+      (E->isAltShuffle() && E->State != TreeEntry::CopyableVectorize)
+          ? (unsigned)Instruction::ShuffleVector
+          : E->getOpcode();
   if (E->CombinedOp != TreeEntry::NotCombinedOp)
     ShuffleOrOp = E->CombinedOp;
   SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
@@ -11237,7 +11624,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   // Negative value means vectorizing is profitable.
   auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
     assert((E->State == TreeEntry::Vectorize ||
-            E->State == TreeEntry::StridedVectorize) &&
+            E->State == TreeEntry::StridedVectorize ||
+            E->State == TreeEntry::CopyableVectorize) &&
            "Entry state expected to be Vectorize or StridedVectorize here.");
     InstructionCost ScalarCost = 0;
     InstructionCost VecCost = 0;
@@ -11669,6 +12057,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     auto GetVectorCost = [&](InstructionCost CommonCost) {
       InstructionCost VecLdCost;
       switch (E->State) {
+      case TreeEntry::CopyableVectorize:
       case TreeEntry::Vectorize:
         if (unsigned Factor = E->getInterleaveFactor()) {
           VecLdCost = TTI->getInterleavedMemoryOpCost(
@@ -11794,7 +12183,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                Instruction::isBinaryOp(E->getAltOpcode())) ||
               (Instruction::isCast(E->getOpcode()) &&
                Instruction::isCast(E->getAltOpcode())) ||
-              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
+              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp())) ||
+              E->State == TreeEntry::CopyableVectorize) &&
              "Invalid Shuffle Vector Operand");
     // Try to find the previous shuffle node with the same operands and same
     // main/alternate ops.
@@ -12550,6 +12940,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
         std::optional<unsigned> InsertIdx = getElementIndex(VU);
         if (InsertIdx) {
           const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
+          if (!ScalarTE && CopyableAltOp.contains(EU.Scalar))
+            continue;
           auto *It = find_if(
               ShuffledInserts,
               [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
@@ -12632,8 +13024,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
                                   EU.Lane, EU.Scalar, ScalarUserAndIdx);
     }
     // Leave the scalar instructions as is if they are cheaper than extracts.
-    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
-        Entry->getOpcode() == Instruction::Load) {
+    if (Entry &&
+        (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
+         Entry->getOpcode() == Instruction::Load)) {
       // Checks if the user of the external scalar is phi in loop body.
       auto IsPhiInLoop = [&](const ExternalUser &U) {
         if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
@@ -13876,13 +14269,25 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
   // VL.back() and iterate over schedule data until we reach the end of the
   // bundle. The end of the bundle is marked by null ScheduleData.
   if (BlocksSchedules.count(BB) && !E->isGather()) {
-    Value *V = E->isOneOf(E->Scalars.back());
+    Value *V = E->getMainOp();
     if (doesNotNeedToBeScheduled(V))
       V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
-    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
+    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V, E);
     if (Bundle && Bundle->isPartOfBundle())
       for (; Bundle; Bundle = Bundle->NextInBundle)
         Res = Bundle->Inst;
+    // We cannot rely on the SLP scheduler for copyable operations, because
+    // there might be inner dependencies that we could not schedule
+    // correctly.
+    if (E->State == TreeEntry::CopyableVectorize) {
+      for (Value *V : E->Scalars) {
+        if (!isa<Instruction>(V))
+          continue;
+        Instruction *Inst = cast<Instruction>(V);
+        if (Res->comesBefore(Inst))
+          Res = Inst;
+      }
+    }
   }
 
   // LastInst can still be null at this point if there's either not an entry
@@ -14632,6 +15037,21 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                          unsigned NodeIdx) {
   ArrayRef<Value *> VL = E->getOperand(NodeIdx);
   InstructionsState S = getSameOpcode(VL, *TLI);
+  if (E->State == TreeEntry::CopyableVectorize) {
+    unsigned Opcode = E->getMainOp()->getOpcode();
+    for (Value *V : VL) {
+      Instruction *I = dyn_cast<Instruction>(V);
+      if (!I)
+        continue;
+      if (I->getOpcode() == Opcode) {
+        TreeEntry *VE = getTreeEntry(V);
+        if (!VE)
+          return nullptr;
+        if (VE->State == TreeEntry::CopyableVectorize)
+          return VE;
+      }
+    }
+  }
   // Special processing for GEPs bundle, which may include non-gep values.
   if (!S && VL.front()->getType()->isPointerTy()) {
     const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
@@ -14656,6 +15076,10 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
   TreeEntry *VE = getTreeEntry(S.getMainOp());
   if (VE && CheckSameVE(VE))
     return VE;
+  if (!VE || !CheckSameVE(VE))
+    VE = getTreeEntry(S.getAltOp());
+  if (VE && VE->State == TreeEntry::CopyableVectorize)
+    return VE;
   auto It = MultiNodeScalars.find(S.getMainOp());
   if (It != MultiNodeScalars.end()) {
     auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
@@ -16486,6 +16910,8 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
     if (User && !is_contained(Scalar->users(), User))
       continue;
     TreeEntry *E = getTreeEntry(Scalar);
+    if (!E && CopyableAltOp.contains(Scalar))
+      continue;
     assert(E && "Invalid scalar");
     assert(!E->isGather() && "Extracting from a gather list");
     // Non-instruction pointers are not deleted, just skip them.
@@ -16873,6 +17299,9 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
         continue;
       if (isa<PoisonValue>(Scalar))
         continue;
+      if (Entry->State == TreeEntry::CopyableVectorize &&
+          cast<Instruction>(Scalar)->getOpcode() != Entry->getOpcode())
+        continue;
 #ifndef NDEBUG
       Type *Ty = Scalar->getType();
       if (!Ty->isVoidTy()) {
@@ -17114,17 +17543,66 @@ void BoUpSLP::optimizeGatherSequence() {
 }
 
 BoUpSLP::ScheduleData *
-BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
+BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL,
+                                      const InstructionsState &S,
+                                      bool IsCopyable, bool &ReSchedule) {
   ScheduleData *Bundle = nullptr;
   ScheduleData *PrevInBundle = nullptr;
+  unsigned Opcode = S.getOpcode();
+  ValueList Keys;
+
   for (Value *V : VL) {
+    auto *SD = getScheduleData(V);
+    bool FoundKey = false;
+    if (SD && !SD->isPartOfBundle()) {
+      Keys.push_back(V);
+      continue;
+    }
+    for (Value *Key : VL) {
+      SD = getScheduleData(V, Key);
+      if (SD && SD->isPartOfBundle()) {
+        ReSchedule = true;
+      } else if (!SD || !SD->isPartOfBundle()) {
+        FoundKey = true;
+        Keys.push_back(Key);
+        break;
+      }
+    }
+    if (!FoundKey) {
+      for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E;
+           ++It) {
+        Value *Key = &*It;
+        if (!Key)
+          continue;
+        SD = getScheduleData(V, Key);
+        if (!SD || !SD->isPartOfBundle()) {
+          FoundKey = true;
+          Keys.push_back(Key);
+          break;
+        }
+      }
+    }
+  }
+
+  for (auto [V, Key] : zip(VL, Keys)) {
     if (doesNotNeedToBeScheduled(V))
       continue;
-    ScheduleData *BundleMember = getScheduleData(V);
+    Instruction *I = dyn_cast<Instruction>(V);
+    bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
+
+    ScheduleData *BundleMember = getScheduleData(V, Key);
+    if (V != Key) {
+      ScheduleData *SD = allocateScheduleDataChunks();
+      Instruction *I = dyn_cast<Instruction>(V);
+      SD->Inst = I;
+      SD->init(SchedulingRegionID, Key);
+      ExtraScheduleDataMap[I][Key] = SD;
+      BundleMember = getScheduleData(V, Key);
+    }
     assert(BundleMember &&
            "no ScheduleData for bundle member "
            "(maybe not in same basic block)");
-    assert(BundleMember->isSchedulingEntity() &&
+    assert(BundleMember->isSchedulingEntity() ||
            "bundle member already part of other bundle");
     if (PrevInBundle) {
       PrevInBundle->NextInBundle = BundleMember;
@@ -17134,6 +17612,8 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
 
     // Group the instructions to a bundle.
     BundleMember->FirstInBundle = Bundle;
+    if (IsCopyable && IsAltInst)
+      BundleMember->IsCopy = true;
     PrevInBundle = BundleMember;
   }
   assert(Bundle && "Failed to find schedule bundle");
@@ -17144,7 +17624,9 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
 // and schedules instructions until the bundle gets ready.
 std::optional<BoUpSLP::ScheduleData *>
 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
-                                            const InstructionsState &S) {
+                                            const InstructionsState &S,
+                                            bool IsCopyable) {
+  bool AnyCopyable = false;
   // No need to schedule PHIs, insertelement, extractelement and extractvalue
   // instructions.
   if (isa<PHINode>(S.getMainOp()) ||
@@ -17155,8 +17637,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
   Instruction *OldScheduleEnd = ScheduleEnd;
   LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");
 
-  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
-                                                         ScheduleData *Bundle) {
+  auto TryScheduleBundleImpl = [this, OldScheduleEnd, &AnyCopyable,
+                                SLP](bool ReSchedule, ScheduleData *Bundle) {
     // The scheduling region got new instructions at the lower end (or it is a
     // new region for the first bundle). This makes it necessary to
     // recalculate all dependencies.
@@ -17164,8 +17646,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     // initial bundle to the region.
     if (ScheduleEnd != OldScheduleEnd) {
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
-        if (ScheduleData *SD = getScheduleData(I))
-          SD->clearDependencies();
+        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
       ReSchedule = true;
     }
     if (Bundle) {
@@ -17186,6 +17667,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
            !ReadyInsts.empty()) {
       ScheduleData *Picked = ReadyInsts.pop_back_val();
+      if (Picked->TE && Picked->TE->State == TreeEntry::CopyableVectorize)
+        AnyCopyable = true;
       assert(Picked->isSchedulingEntity() && Picked->isReady() &&
              "must be ready to schedule");
       schedule(Picked, ReadyInsts);
@@ -17231,24 +17714,35 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     ReSchedule = true;
   }
 
-  auto *Bundle = buildBundle(VL);
+  auto *Bundle = buildBundle(VL, S, IsCopyable, ReSchedule);
+  if (!Bundle)
+    return std::nullopt;
   TryScheduleBundleImpl(ReSchedule, Bundle);
   if (!Bundle->isReady()) {
-    cancelScheduling(VL, S.getMainOp());
+    cancelScheduling(VL, Bundle);
+    // If we have any copyable element, we have to clear all
+    // dependencies, since all values were calculated for
+    // the vectorized bundle.
+    if (AnyCopyable) {
+      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
+      }
+      resetSchedule();
+    }
     return std::nullopt;
   }
   return Bundle;
 }
 
 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
-                                                Value *OpValue) {
-  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
-      doesNotNeedToSchedule(VL))
+                                                ScheduleData *Bundle) {
+  if (isa<PHINode>(VL.front()) || isVectorLikeInstWithConstOps(VL.front()) ||
+      doesNotNeedToSchedule(VL) || !Bundle)
     return;
 
-  if (doesNotNeedToBeScheduled(OpValue))
-    OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
-  ScheduleData *Bundle = getScheduleData(OpValue);
+  if (Bundle->FirstInBundle)
+    Bundle = Bundle->FirstInBundle;
+
   LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
   assert(!Bundle->IsScheduled &&
          "Can't cancel bundle which is already scheduled");
@@ -17271,6 +17765,13 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
     if (BundleMember->unscheduledDepsInBundle() == 0) {
       ReadyInsts.insert(BundleMember);
     }
+    auto I = ExtraScheduleDataMap.find(BundleMember->Inst);
+    if (I != ExtraScheduleDataMap.end()) {
+      for (auto &SD : I->second) {
+        if (SD.second == BundleMember)
+          ExtraScheduleDataMap[BundleMember->Inst].erase(SD.first);
+      }
+    }
     BundleMember = Next;
   }
 }
@@ -17286,19 +17787,34 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
 
 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
     Value *V, const InstructionsState &S) {
+  if (getScheduleData(V, S.getMainOp()))
+    return true;
   Instruction *I = dyn_cast<Instruction>(V);
   assert(I && "bundle member must be an instruction");
   assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
          !doesNotNeedToBeScheduled(I) &&
          "phi nodes/insertelements/extractelements/extractvalues don't need to "
          "be scheduled");
-  if (getScheduleData(I))
+  auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
+    ScheduleData *ISD = getScheduleData(I);
+    if (!ISD)
+      return false;
+    assert(isInSchedulingRegion(ISD) &&
+           "ScheduleData not in scheduling region");
+    ScheduleData *SD = allocateScheduleDataChunks();
+    SD->Inst = I;
+    SD->init(SchedulingRegionID, S.getMainOp());
+    return true;
+  };
+  if (CheckScheduleForI(I))
     return true;
   if (!ScheduleStart) {
     // It's the first instruction in the new region.
     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
     ScheduleStart = I;
     ScheduleEnd = I->getNextNode();
+    if (isOneOf(S, I) != I)
+      CheckScheduleForI(I);
     assert(ScheduleEnd && "tried to vectorize a terminator?");
     LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
     return true;
@@ -17337,6 +17853,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
            "Instruction is in wrong basic block.");
     initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
     ScheduleStart = I;
+    if (isOneOf(S, I) != I)
+      CheckScheduleForI(I);
     LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                       << "\n");
     return true;
@@ -17349,6 +17867,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
   initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                    nullptr);
   ScheduleEnd = I->getNextNode();
+  if (isOneOf(S, I) != I)
+    CheckScheduleForI(I);
   assert(ScheduleEnd && "tried to vectorize a terminator?");
   LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
   return true;
@@ -17367,6 +17887,7 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
     if (!SD) {
       SD = allocateScheduleDataChunks();
       ScheduleDataMap[I] = SD;
+      SD->Inst = I;
     }
     assert(!isInSchedulingRegion(SD) &&
            "new ScheduleData already in scheduling region");
@@ -17421,26 +17942,32 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
 
       // Handle def-use chain dependencies.
       for (User *U : BundleMember->Inst->users()) {
-        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
+        if (auto *I = dyn_cast<Instruction>(U)) {
+          doForAllOpcodes(I, [&](ScheduleData *UseSD) {
+            ScheduleData *DestBundle = UseSD->FirstInBundle;
+            if ((UseSD->IsCopy || BundleMember->IsCopy) &&
+                DestBundle == BundleMember->FirstInBundle)
+              return;
+            BundleMember->Dependencies++;
+            if (!DestBundle->IsScheduled)
+              BundleMember->incrementUnscheduledDeps(1);
+            if (!DestBundle->hasValidDependencies())
+              WorkList.push_back(DestBundle);
+          });
+        }
+      }
+
+      auto MakeControlDependent = [&](Instruction *I) {
+        doForAllOpcodes(I, [&](ScheduleData *DepDest) {
+          assert(DepDest && "must be in schedule window");
+          DepDest->ControlDependencies.push_back(BundleMember);
           BundleMember->Dependencies++;
-          ScheduleData *DestBundle = UseSD->FirstInBundle;
+          ScheduleData *DestBundle = DepDest->FirstInBundle;
           if (!DestBundle->IsScheduled)
             BundleMember->incrementUnscheduledDeps(1);
           if (!DestBundle->hasValidDependencies())
             WorkList.push_back(DestBundle);
-        }
-      }
-
-      auto MakeControlDependent = [&](Instruction *I) {
-        auto *DepDest = getScheduleData(I);
-        assert(DepDest && "must be in schedule window");
-        DepDest->ControlDependencies.push_back(BundleMember);
-        BundleMember->Dependencies++;
-        ScheduleData *DestBundle = DepDest->FirstInBundle;
-        if (!DestBundle->IsScheduled)
-          BundleMember->incrementUnscheduledDeps(1);
-        if (!DestBundle->hasValidDependencies())
-          WorkList.push_back(DestBundle);
+        });
       };
 
       // Any instruction which isn't safe to speculate at the beginning of the
@@ -17576,12 +18103,12 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
   assert(ScheduleStart &&
          "tried to reset schedule on block which has not been scheduled");
   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
-    if (ScheduleData *SD = getScheduleData(I)) {
+    doForAllOpcodes(I, [&](ScheduleData *SD) {
       assert(isInSchedulingRegion(SD) &&
              "ScheduleData not in scheduling region");
       SD->IsScheduled = false;
       SD->resetUnscheduledDeps();
-    }
+    });
   }
   ReadyInsts.clear();
 }
@@ -17616,8 +18143,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   int Idx = 0;
   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
        I = I->getNextNode()) {
-    if (ScheduleData *SD = BS->getScheduleData(I)) {
-      [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
+    BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
+      [[maybe_unused]] TreeEntry *SDTE = SD->TE;
       assert((isVectorLikeInstWithConstOps(SD->Inst) ||
               SD->isPartOfBundle() ==
                   (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
@@ -17626,7 +18153,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
 
       if (SD->isSchedulingEntity() && SD->isPartOfBundle())
         BS->calculateDependencies(SD, false, this);
-    }
+    });
   }
   BS->initialFillReadyList(ReadyInsts);
 
@@ -17642,7 +18169,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     for (ScheduleData *BundleMember = Picked; BundleMember;
          BundleMember = BundleMember->NextInBundle) {
       Instruction *PickedInst = BundleMember->Inst;
-      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
+      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst &&
+          LastScheduledInst->getPrevNode())
         PickedInst->moveAfter(LastScheduledInst->getPrevNode());
       LastScheduledInst = PickedInst;
     }
@@ -17658,9 +18186,11 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
 #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
   // Check that all schedulable entities got scheduled
   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
-    ScheduleData *SD = BS->getScheduleData(I);
-    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
-      assert(SD->IsScheduled && "must be scheduled at this point");
+    BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
+      if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
+        assert(SD->IsScheduled && "must be scheduled at this point");
+      }
+    });
   }
 #endif
 
@@ -17771,6 +18301,9 @@ bool BoUpSLP::collectValuesToDemote(
   if (NodesToKeepBWs.contains(E.Idx))
     return false;
 
+  if (E.State == TreeEntry::CopyableVectorize)
+    return false;
+
   // If the value is not a vectorized instruction in the expression and not used
   // by the insertelement instruction and not used in multiple vector nodes, it
   // cannot be demoted.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 869a9d1aee80e3..4bbff7b513859a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
 ; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -slp-vectorize-copyable=true -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,COPYABLE %s
 
 define void @add0(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @add0(
@@ -60,6 +61,13 @@ define void @add1(ptr noalias %dst, ptr noalias %src) {
 ; POW2-ONLY-NEXT:    store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
+; COPYABLE-LABEL: @add1(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
+; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    ret void
+;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
   %0 = load i32, ptr %src, align 4
@@ -82,21 +90,44 @@ entry:
 }
 
 define void @sub0(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @sub0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @sub0(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; NON-POW2-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT:    store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @sub0(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT:    store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @sub0(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 -1, i32 0, i32 -2, i32 -3>
+; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -180,23 +211,55 @@ entry:
 }
 
 define void @addsub0(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub0(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @addsub0(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; NON-POW2-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @addsub0(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT:    store i32 [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @addsub0(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 0>
+; COPYABLE-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; COPYABLE-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; COPYABLE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -220,23 +283,55 @@ entry:
 }
 
 define void @addsub1(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
-; CHECK-NEXT:    store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @addsub1(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; NON-POW2-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; NON-POW2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; NON-POW2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; NON-POW2-NEXT:    store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
+; NON-POW2-NEXT:    store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @addsub1(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
+; POW2-ONLY-NEXT:    store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @addsub1(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; COPYABLE-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; COPYABLE-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT:    [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP4]], <i32 0, i32 -3>
+; COPYABLE-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -260,21 +355,44 @@ entry:
 }
 
 define void @mul(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @mul(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
-; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
-; CHECK-NEXT:    store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @mul(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
+; NON-POW2-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; NON-POW2-NEXT:    store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; NON-POW2-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; NON-POW2-NEXT:    store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @mul(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
+; POW2-ONLY-NEXT:    store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; POW2-ONLY-NEXT:    store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @mul(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[TMP0]], <i32 257, i32 -3, i32 1, i32 -9>
+; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -325,6 +443,13 @@ define void @shl0(ptr noalias %dst, ptr noalias %src) {
 ; POW2-ONLY-NEXT:    store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
+; COPYABLE-LABEL: @shl0(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
+; COPYABLE-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT:    ret void
+;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
   %0 = load i32, ptr %src, align 4
@@ -434,6 +559,22 @@ define void @add1f(ptr noalias %dst, ptr noalias %src) {
 ; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
+; COPYABLE-LABEL: @add1f(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; COPYABLE-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; COPYABLE-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; COPYABLE-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; COPYABLE-NEXT:    ret void
+;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
   %0 = load float, ptr %src, align 4
@@ -554,23 +695,62 @@ entry:
 }
 
 define void @addsub0f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub0f(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @addsub0f(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT:    store float [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; NON-POW2-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; NON-POW2-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; NON-POW2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @addsub0f(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT:    store float [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @addsub0f(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; COPYABLE-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT:    store float [[SUB]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; COPYABLE-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; COPYABLE-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT:    [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
+; COPYABLE-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT:    store float [[SUB5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; COPYABLE-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; COPYABLE-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -594,23 +774,62 @@ entry:
 }
 
 define void @addsub1f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub1f(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
-; CHECK-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @addsub1f(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; NON-POW2-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; NON-POW2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; NON-POW2-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; NON-POW2-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
+; NON-POW2-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @addsub1f(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
+; POW2-ONLY-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @addsub1f(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; COPYABLE-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT:    store float [[SUB]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; COPYABLE-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT:    [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
+; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; COPYABLE-NEXT:    store float [[SUB1]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; COPYABLE-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; COPYABLE-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; COPYABLE-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -729,6 +948,22 @@ define void @add1fn(ptr noalias %dst, ptr noalias %src) {
 ; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
+; COPYABLE-LABEL: @add1fn(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; COPYABLE-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; COPYABLE-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; COPYABLE-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; COPYABLE-NEXT:    ret void
+;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
   %0 = load float, ptr %src, align 4
@@ -885,3 +1120,106 @@ entry:
   store float %sub9, ptr %incdec.ptr7, align 4
   ret void
 }
+
+define void @and_lshr(ptr %0, ptr %1, float %2, float %3) {
+; NON-POW2-LABEL: @and_lshr(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; NON-POW2-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; NON-POW2-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP5]], 2
+; NON-POW2-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP5]], 4
+; NON-POW2-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; NON-POW2-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP6]], i32 1
+; NON-POW2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP7]], i32 2
+; NON-POW2-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+; NON-POW2-NEXT:    [[TMP12:%.*]] = and <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; NON-POW2-NEXT:    [[TMP13:%.*]] = lshr <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; NON-POW2-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; NON-POW2-NEXT:    [[TMP15:%.*]] = sitofp <4 x i32> [[TMP14]] to <4 x float>
+; NON-POW2-NEXT:    [[TMP16:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; NON-POW2-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP18:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> poison, <4 x i32> zeroinitializer
+; NON-POW2-NEXT:    [[TMP19:%.*]] = fadd <4 x float> [[TMP16]], [[TMP18]]
+; NON-POW2-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <4 x i32> zeroinitializer
+; NON-POW2-NEXT:    [[TMP22:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP21]], <4 x float> [[TMP15]], <4 x float> [[TMP19]])
+; NON-POW2-NEXT:    store <4 x float> [[TMP22]], ptr [[TMP0]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @and_lshr(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; POW2-ONLY-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; POW2-ONLY-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP5]], 2
+; POW2-ONLY-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP5]], 4
+; POW2-ONLY-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; POW2-ONLY-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP6]], i32 1
+; POW2-ONLY-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP7]], i32 2
+; POW2-ONLY-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+; POW2-ONLY-NEXT:    [[TMP12:%.*]] = and <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; POW2-ONLY-NEXT:    [[TMP13:%.*]] = lshr <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; POW2-ONLY-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; POW2-ONLY-NEXT:    [[TMP15:%.*]] = sitofp <4 x i32> [[TMP14]] to <4 x float>
+; POW2-ONLY-NEXT:    [[TMP16:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; POW2-ONLY-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[TMP18:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> poison, <4 x i32> zeroinitializer
+; POW2-ONLY-NEXT:    [[TMP19:%.*]] = fadd <4 x float> [[TMP16]], [[TMP18]]
+; POW2-ONLY-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <4 x i32> zeroinitializer
+; POW2-ONLY-NEXT:    [[TMP22:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP21]], <4 x float> [[TMP15]], <4 x float> [[TMP19]])
+; POW2-ONLY-NEXT:    store <4 x float> [[TMP22]], ptr [[TMP0]], align 4
+; POW2-ONLY-NEXT:    ret void
+;
+; COPYABLE-LABEL: @and_lshr(
+; COPYABLE-NEXT:  entry:
+; COPYABLE-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; COPYABLE-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; COPYABLE-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; COPYABLE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 0, i32 2, i32 4, i32 6>
+; COPYABLE-NEXT:    [[TMP9:%.*]] = and <4 x i32> [[TMP8]], <i32 3, i32 3, i32 3, i32 -1>
+; COPYABLE-NEXT:    [[TMP10:%.*]] = sitofp <4 x i32> [[TMP9]] to <4 x float>
+; COPYABLE-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; COPYABLE-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; COPYABLE-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT:    [[TMP14:%.*]] = fadd <4 x float> [[TMP11]], [[TMP13]]
+; COPYABLE-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; COPYABLE-NEXT:    [[TMP16:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT:    [[TMP17:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP16]], <4 x float> [[TMP10]], <4 x float> [[TMP14]])
+; COPYABLE-NEXT:    store <4 x float> [[TMP17]], ptr [[TMP0]], align 4
+; COPYABLE-NEXT:    ret void
+;
+entry:
+  %5 = getelementptr inbounds float, ptr %0, i64 1
+  %6 = getelementptr inbounds float, ptr %0, i64 2
+  %7 = getelementptr inbounds float, ptr %0, i64 3
+  %8 = load i8, ptr %1, align 1
+  %9 = zext i8 %8 to i32
+  %10 = and i32 %9, 3
+  %11 = sitofp i32 %10 to float
+  %12 = lshr i32 %9, 2
+  %13 = and i32 %12, 3
+  %14 = sitofp i32 %13 to float
+  %15 = lshr i32 %9, 4
+  %16 = and i32 %15, 3
+  %17 = sitofp i32 %16 to float
+  %18 = lshr i32 %9, 6
+  %19 = sitofp i32 %18 to float
+  %20 = load float, ptr %0, align 4
+  %21 = fadd float %20, %3
+  %22 = tail call noundef float @llvm.fma.f32(float %2, float %11, float %21)
+  store float %22, ptr %0, align 4
+  %23 = load float, ptr %5, align 4
+  %24 = fadd float %23, %3
+  %25 = tail call noundef float @llvm.fma.f32(float %2, float %14, float %24)
+  store float %25, ptr %5, align 4
+  %26 = load float, ptr %6, align 4
+  %27 = fadd float %26, %3
+  %28 = tail call noundef float @llvm.fma.f32(float %2, float %17, float %27)
+  store float %28, ptr %6, align 4
+  %29 = load float, ptr %7, align 4
+  %30 = fadd float %29, %3
+  %31 = tail call noundef float @llvm.fma.f32(float %2, float %19, float %30)
+  store float %31, ptr %7, align 4
+  ret void
+}
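
For readers skimming the COPYABLE check lines above: the core idea is that a
lane whose scalar is only copied (it has no matching operation) can still be
vectorized by modeling it as the bundle's main operation applied with its
identity constant. A minimal sketch distilled from the @add1 and @mul tests
above (the names %src and %dst and the constants are illustrative, not code
from the patch):

  ; Scalar form: lane 0 is a plain copy of the load, the other lanes add.
  %v0 = load i32, ptr %src, align 4
  store i32 %v0, ptr %dst, align 4        ; copy lane, no add

  ; Vectorized form: the copy lane becomes an add of 0, the identity.
  %vec = load <4 x i32>, ptr %src, align 4
  %sum = add nsw <4 x i32> %vec, <i32 0, i32 1, i32 2, i32 3>
  store <4 x i32> %sum, ptr %dst, align 4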

>From 095957cb2575abba91560dcaa6dd6ea660acbd99 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at gmail.com>
Date: Wed, 22 Jan 2025 21:08:26 +0000
Subject: [PATCH 2/2] This change resolves all issues with the scheduler;
 I probably want to avoid using SmallVector in
 BoUpSLP::scheduleBlock(BlockScheduling *BS) and move this logic to
 BS->schedule(). This change also resolved a few regressions in SLP
 tests. There is just one regression remaining in X86/load-merge.ll when
 -slp-vectorize-copyable is enabled by default.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 112 +++++++-------
 .../X86/vect_copyable_in_binops.ll            | 146 ++++--------------
 2 files changed, 88 insertions(+), 170 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 47b61496b5e155..5bd8128426b256 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -917,7 +917,8 @@ static bool isValidForAlternation(unsigned Opcode) {
 }
 
 // Check for inner dependencies; we cannot support such dependencies if they
-// come from a main operation, only from an alternative.
+// come from a main operation, only from an alternative. For now we also
+// ignore dependencies of alternative operations on any alternative.
 static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
                                   const InstructionsState &S);
 
@@ -997,12 +998,13 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     return InstructionsState::invalid();
   bool IsBinOp = isa<BinaryOperator>(MainOp);
   bool IsCopyable = false;
-
+  bool MainOpSwapped = false;
   if (MainOp && AltOp && MainOp != AltOp) {
     if (!IsBinOp && isa<BinaryOperator>(AltOp) && !isa<PHINode>(MainOp)) {
       std::swap(MainOp, AltOp);
       std::swap(AltOpcode, Opcode);
       IsBinOp = true;
+      MainOpSwapped = true;
     }
     IsCopyable = VectorizeCopyable && isCopyableOp(VL, MainOp, AltOp);
     if (IsCopyable && isa<CmpInst>(AltOp)) {
@@ -1010,13 +1012,13 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
       Type *Ty1 = AltOp->getOperand(0)->getType();
       if (Ty0 != Ty1)
         return InstructionsState::invalid();
-    }
-    if (!IsCopyable) {
+    } else if (!IsCopyable) {
       MainOp = cast<Instruction>(*It);
       AltOp = MainOp;
       Opcode = MainOp->getOpcode();
       AltOpcode = Opcode;
       IsBinOp = isa<BinaryOperator>(MainOp);
+      MainOpSwapped = false;
     }
   }
   bool IsCastOp = isa<CastInst>(MainOp);
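
(A sketch of why the MainOp/AltOp swap above matters, with illustrative
names: for a bundle such as

  %a = load i32, ptr %p, align 4   ; non-binop lane
  %b = add nsw i32 %x, 1           ; binop lane

the binary operation has to end up as MainOp so that the load lane can be
modeled as a copy plus the identity "add 0"; MainOpSwapped then tells the
loop below not to skip the first value, since it is no longer MainOp.)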
@@ -1056,8 +1058,8 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
       return InstructionsState::invalid();
   }
   bool AnyPoison = InstCnt != VL.size();
-  // Skip MainOp.
-  for (Value *V : iterator_range(It + 1, VL.end())) {
+  // Skip MainOp unless it was swapped above.
+  for (Value *V : iterator_range(MainOpSwapped ? It : It + 1, VL.end())) {
     auto *I = dyn_cast<Instruction>(V);
     if (!I)
       continue;
@@ -1082,7 +1084,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
       Type *Ty0 = Op0->getType();
       Value *Op1 = I->getOperand(0);
       Type *Ty1 = Op1->getType();
-      if (Ty0 == Ty1) {
+      if (Ty0 == Ty1 || IsCopyable) {
         if (InstOpcode == Opcode || InstOpcode == AltOpcode)
           continue;
         if (Opcode == AltOpcode) {
@@ -1232,6 +1234,7 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
 static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
                                   const InstructionsState &S) {
   SmallSet<Value *, 4> Ops;
+  SmallSet<Value *, 4> AltOps;
   unsigned Opcode = S.getOpcode();
   for (Value *V : VL) {
     auto *I = dyn_cast<Instruction>(V);
@@ -1239,14 +1242,21 @@ static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
       continue;
     if (I->getOpcode() == Opcode)
       Ops.insert(V);
+    else
+      AltOps.insert(V);
   }
   for (Value *V : VL) {
     auto *I = dyn_cast<Instruction>(V);
     if (!I)
       continue;
-    for (Use &U : I->operands()) {
+    for (Use &U : I->operands())
       if (auto *Op = dyn_cast<Instruction>(U.get()); Op && Ops.contains(Op))
         return false;
+    if (I->getOpcode() != Opcode) {
+      for (Use &U : I->operands())
+        if (auto *Op = dyn_cast<Instruction>(U.get());
+            Op && AltOps.contains(Op))
+          return false;
     }
   }
   return true;
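
(An illustration of the inner dependency this check rejects, with made-up
lanes: given a bundle {%a, %b} where

  %a = add i32 %x, 1
  %b = sub i32 %a, %y   ; %b consumes %a from the same bundle

%b depends on a main-opcode member of its own bundle, so
checkCopyableInnerDep returns false; the new AltOps set extends the same
rejection to dependencies between alternative-opcode members.)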
@@ -3896,7 +3906,7 @@ class BoUpSLP {
   bool areAltOperandsProfitable(const InstructionsState &S,
                                 ArrayRef<Value *> VL) const;
 
-  /// Check that we can represent operations as copyable with looking to
+  /// Check that we can represent operations as copyable by looking at
   /// the operations' operands.
   bool canRepresentAsCopyable(const InstructionsState &S, ArrayRef<Value *> VL);
 
@@ -6443,6 +6453,8 @@ bool BoUpSLP::canReorderOperands(
         }))
       continue;
     if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
+      if (TE->State == TreeEntry::CopyableVectorize)
+        return false;
       // Do not reorder if operand node is used by many user nodes.
       if (any_of(TE->UserTreeIndices,
                  [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
@@ -7751,6 +7763,7 @@ bool BoUpSLP::canRepresentAsCopyable(const InstructionsState &S,
           continue;
         if (Inst1->getOpcode() == Opcode0)
           return false;
+        // FIXME: Independent operand number.
         if (Inst1->isBinaryOp() && !isa<ConstantInt>(Inst1->getOperand(1)))
           return false;
         if (AltOps.contains(I) ||
@@ -8252,15 +8265,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
              "the whole alt sequence is not profitable.\n");
       return TreeEntry::NeedToGather;
     }
+    if (VectorizeCopyable && canRepresentAsCopyable(S, VL))
+      return TreeEntry::CopyableVectorize;
 
-    if (VectorizeCopyable) {
-      if (canRepresentAsCopyable(S, VL))
-        return TreeEntry::CopyableVectorize;
-
-      if (!tryToRepresentAsInstArg(S.getOpcode(), S.getAltOp()) ||
-          !tryToRepresentAsInstArg(S.getAltOpcode(), S.getMainOp()))
-        return TreeEntry::NeedToGather;
-    }
     return TreeEntry::Vectorize;
   }
   default:
@@ -8718,6 +8725,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   SmallVector<Value *> PointerOps;
   TreeEntry::EntryState State = getScalarsVectorizationState(
       S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+  if (State == TreeEntry::CopyableVectorize) {
+    for (Value *V : VL) {
+      Instruction *I = dyn_cast<Instruction>(V);
+      if (!I)
+        continue;
+      if (I->getOpcode() == S.getAltOpcode() && CopyableAltOp.contains(V)) {
+        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+                     ReuseShuffleIndices);
+        return;
+      }
+    }
+  }
   if (State == TreeEntry::NeedToGather ||
       (State == TreeEntry::CopyableVectorize &&
        !has_single_bit(UniqueValues.size()))) {
@@ -9151,18 +9170,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
             TE->dump());
       }
-
       if (State == TreeEntry::CopyableVectorize &&
           !isCopyableOp(VL, S.getMainOp(), S.getAltOp())) {
         ValueList Left, Right;
         unsigned Opcode0 = S.getOpcode();
         unsigned Opcode1 = S.getAltOpcode();
-
         unsigned Operand;
         bool IsOperandSet = false;
         ValueList newMainVL;
         ValueList newVL;
-
         for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
           Instruction *Inst = cast<Instruction>(VL[I]);
           if (Inst->getOpcode() == Opcode0) {
@@ -9173,10 +9189,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
               newVL.push_back(Inst->getOperand(Op));
               continue;
             }
-
             if (IsOperandSet && Op != Operand && !Inst1->isCommutative())
               return;
-
             if (Inst1->getOpcode() == Opcode1) {
               if (!IsOperandSet) {
                 Operand = Op;
@@ -9196,7 +9210,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             Right[I] = ConstantExpr::getBinOpIdentity(
                 Opcode0, Right[0]->getType(), true);
           }
-
         TE->setOperand(0, newVL);
         TE->setOperand(1, Right);
         buildTree_rec(newVL, Depth + 1, {TE, 0});
@@ -14275,19 +14288,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
     auto *Bundle = BlocksSchedules[BB]->getScheduleData(V, E);
     if (Bundle && Bundle->isPartOfBundle())
       for (; Bundle; Bundle = Bundle->NextInBundle)
-        Res = Bundle->Inst;
-    // Somehow we could not reply on the SLP scedualar for copyable operations,
-    // because there might be inner dependencies that we could not schedule
-    // correctly.
-    if (E->State == TreeEntry::CopyableVectorize) {
-      for (Value *V : E->Scalars) {
-        if (!isa<Instruction>(V))
-          continue;
-        Instruction *Inst = cast<Instruction>(V);
-        if (Res->comesBefore(Inst))
-          Res = Inst;
-      }
-    }
+        if (!CopyableAltOp.contains(Bundle->Inst))
+          Res = Bundle->Inst;
   }
 
   // LastInst can still be null at this point if there's either not an entry
@@ -15037,21 +15039,6 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                          unsigned NodeIdx) {
   ArrayRef<Value *> VL = E->getOperand(NodeIdx);
   InstructionsState S = getSameOpcode(VL, *TLI);
-  if (E->State == TreeEntry::CopyableVectorize) {
-    unsigned Opcode = E->getMainOp()->getOpcode();
-    for (Value *V : VL) {
-      Instruction *I = dyn_cast<Instruction>(V);
-      if (!I)
-        continue;
-      if (I->getOpcode() == Opcode) {
-        TreeEntry *VE = getTreeEntry(V);
-        if (!VE)
-          return nullptr;
-        if (VE->State == TreeEntry::CopyableVectorize)
-          return VE;
-      }
-    }
-  }
   // Special processing for GEPs bundle, which may include non-gep values.
   if (!S && VL.front()->getType()->isPointerTy()) {
     const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
@@ -15076,9 +15063,8 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
   TreeEntry *VE = getTreeEntry(S.getMainOp());
   if (VE && CheckSameVE(VE))
     return VE;
-  if (!VE || !CheckSameVE(VE))
-    VE = getTreeEntry(S.getAltOp());
-  if (VE && VE->State == TreeEntry::CopyableVectorize)
+  VE = getTreeEntry(S.getAltOp());
+  if (VE && CheckSameVE(VE))
     return VE;
   auto It = MultiNodeScalars.find(S.getMainOp());
   if (It != MultiNodeScalars.end()) {
@@ -17583,13 +17569,11 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL,
       }
     }
   }
-
   for (auto [V, Key] : zip(VL, Keys)) {
     if (doesNotNeedToBeScheduled(V))
       continue;
     Instruction *I = dyn_cast<Instruction>(V);
     bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
-
     ScheduleData *BundleMember = getScheduleData(V, Key);
     if (V != Key) {
       ScheduleData *SD = allocateScheduleDataChunks();
@@ -17722,7 +17706,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     cancelScheduling(VL, Bundle);
     // If we have any copyable element, we have to clear
     // all dependencies, since all values were calculated for
-    // the vectorized bundle.
+    // the vectorized bundles with copies.
     if (AnyCopyable) {
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
         doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
@@ -18166,15 +18150,27 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
 
     // Move the scheduled instruction(s) to their dedicated places, if not
     // there yet.
+    SmallVector<Instruction *, 2> DepInstrs;
     for (ScheduleData *BundleMember = Picked; BundleMember;
          BundleMember = BundleMember->NextInBundle) {
+      if (BundleMember->TE &&
+          BundleMember->TE->State == TreeEntry::CopyableVectorize &&
+          BundleMember->TE->getAltOpcode() == BundleMember->Inst->getOpcode()) {
+        DepInstrs.push_back(BundleMember->Inst);
+        continue;
+      }
       Instruction *PickedInst = BundleMember->Inst;
       if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst &&
           LastScheduledInst->getPrevNode())
         PickedInst->moveAfter(LastScheduledInst->getPrevNode());
       LastScheduledInst = PickedInst;
     }
-
+    for (Instruction *PickedInst : DepInstrs) {
+      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst &&
+          LastScheduledInst->getPrevNode())
+        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
+      LastScheduledInst = PickedInst;
+    }
     BS->schedule(Picked, ReadyInsts);
   }
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 4bbff7b513859a..57e107d3a30703 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -695,62 +695,23 @@ entry:
 }
 
 define void @addsub0f(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @addsub0f(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store float [[SUB]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; NON-POW2-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; NON-POW2-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; NON-POW2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; NON-POW2-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @addsub0f(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store float [[SUB]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; POW2-ONLY-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; POW2-ONLY-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; POW2-ONLY-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT:    ret void
-;
-; COPYABLE-LABEL: @addsub0f(
-; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; COPYABLE-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; COPYABLE-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; COPYABLE-NEXT:    store float [[SUB]], ptr [[DST]], align 4
-; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; COPYABLE-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; COPYABLE-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; COPYABLE-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; COPYABLE-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT:    [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
-; COPYABLE-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; COPYABLE-NEXT:    store float [[SUB5]], ptr [[INCDEC_PTR3]], align 4
-; COPYABLE-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; COPYABLE-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; COPYABLE-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; COPYABLE-NEXT:    ret void
+; CHECK-LABEL: @addsub0f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[SUB]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
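
Note on this test update: the three run configurations (NON-POW2, POW2-ONLY, and the new COPYABLE mode) now emit identical IR for @addsub0f, vectorizing the trailing fadd/fsub pair into a single <2 x float> alternate-opcode bundle, so the per-prefix blocks fold into one shared CHECK block (presumably regenerated with update_test_checks.py). The @addsub1f hunk below is consolidated the same way.
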
@@ -774,62 +735,23 @@ entry:
 }
 
 define void @addsub1f(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @addsub1f(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; NON-POW2-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; NON-POW2-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; NON-POW2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; NON-POW2-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; NON-POW2-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; NON-POW2-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
-; NON-POW2-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @addsub1f(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
-; POW2-ONLY-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; POW2-ONLY-NEXT:    ret void
-;
-; COPYABLE-LABEL: @addsub1f(
-; COPYABLE-NEXT:  entry:
-; COPYABLE-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; COPYABLE-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; COPYABLE-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; COPYABLE-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; COPYABLE-NEXT:    store float [[SUB]], ptr [[DST]], align 4
-; COPYABLE-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; COPYABLE-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; COPYABLE-NEXT:    [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
-; COPYABLE-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; COPYABLE-NEXT:    store float [[SUB1]], ptr [[INCDEC_PTR1]], align 4
-; COPYABLE-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; COPYABLE-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; COPYABLE-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR3]], align 4
-; COPYABLE-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; COPYABLE-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; COPYABLE-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; COPYABLE-NEXT:    ret void
+; CHECK-LABEL: @addsub1f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT:    store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
+; CHECK-NEXT:    store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1



More information about the llvm-commits mailing list