[llvm] [SLP] SLP's copyable elements based upon Main/Alt operations. (PR #124242)
Dinar Temirbulatov via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 11 07:16:40 PDT 2025
https://github.com/dtemirbulatov updated https://github.com/llvm/llvm-project/pull/124242
From bde8c3adbbfd296e459601113acd112ce786d5e7 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at gmail.com>
Date: Sun, 23 Feb 2025 02:48:12 +0000
Subject: [PATCH 1/3] [SLP] SLP's copyable elements based upon Main/Alt
operations.
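
With this change, SLP can vectorize bundles in which some lanes do not
contain the main opcode by modeling those lanes as "copyable" elements:
such a lane is treated as the main (binary) operation applied to the
original value and the identity constant of that opcode. A minimal
sketch of the resulting IR, taken from the sub0 test updated below
(lane 1 is the copyable element, modeled as `add x, 0`):

  %0 = load <4 x i32>, ptr %src, align 4
  %1 = add nsw <4 x i32> %0, <i32 -1, i32 0, i32 -2, i32 -3>
  store <4 x i32> %1, ptr %dst, align 4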
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 830 +++++++++++++++---
.../X86/vect_copyable_in_binops.ll | 723 ++++++++++++---
2 files changed, 1269 insertions(+), 284 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bf256d82ae17d..5225eb2b2eefa 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -201,6 +201,10 @@ static cl::opt<bool> VectorizeNonPowerOf2(
"slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
+static cl::opt<bool>
+ VectorizeCopyable("slp-vectorize-copyable", cl::init(false), cl::Hidden,
+ cl::desc("Try to vectorize with copyable elements."));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -594,6 +598,40 @@ static std::optional<unsigned> getElementIndex(const Value *Inst,
return Index;
}
+/// Checks whether an instruction with opcode \p Opcode can be considered as
+/// an operand of the (possibly binary) operation \p I.
+/// \returns the opcode of the binary operation \p I if an instruction with
+/// \p Opcode can be modeled as an operand of \p I together with a default
+/// (identity) value, and 0 otherwise.
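+/// For example (illustrative): for an `add` instruction \p I and
+/// \p Opcode == Instruction::Load, the loaded value can be modeled as
+/// `add x, 0`, so Instruction::Add is returned.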
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+ if (Opcode != Instruction::PHI && Opcode != Instruction::Invoke &&
+ (I->getOpcode() == Instruction::Add ||
+ I->getOpcode() == Instruction::And ||
+ I->getOpcode() == Instruction::AShr ||
+ I->getOpcode() == Instruction::BitCast ||
+ I->getOpcode() == Instruction::Call ||
+ // ExtractElement/ExtractValue are disabled for now due to a
+ // scheduling issue with isVectorLikeInstWithConstOps operations.
+ // I->getOpcode() == Instruction::ExtractElement ||
+ // I->getOpcode() == Instruction::ExtractValue ||
+ I->getOpcode() == Instruction::ICmp ||
+ I->getOpcode() == Instruction::Load ||
+ I->getOpcode() == Instruction::LShr ||
+ I->getOpcode() == Instruction::Mul ||
+ I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::PtrToInt ||
+ I->getOpcode() == Instruction::Select ||
+ I->getOpcode() == Instruction::SExt ||
+ I->getOpcode() == Instruction::Shl ||
+ I->getOpcode() == Instruction::Sub ||
+ I->getOpcode() == Instruction::Trunc ||
+ I->getOpcode() == Instruction::Xor ||
+ I->getOpcode() == Instruction::ZExt ||
+ (isa<FPMathOperator>(I) && cast<FPMathOperator>(I)->isFast())))
+ return I->getOpcode();
+ return 0;
+}
+
namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
@@ -816,6 +854,9 @@ class InstructionsState {
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
+ /// True if the alternative operation is a copy instruction.
+ bool IsAltOpCopy = false;
+
public:
Instruction *getMainOp() const {
assert(valid() && "InstructionsState is invalid.");
@@ -832,9 +873,13 @@ class InstructionsState {
unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
+ bool isAltOpCopy() const { return IsAltOpCopy; }
+
/// Some of the instructions in the list have alternate opcodes.
bool isAltShuffle() const { return getMainOp() != getAltOp(); }
+ void setAltOpCopy(bool Val) { IsAltOpCopy = Val; }
+
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
@@ -853,6 +898,16 @@ class InstructionsState {
} // end anonymous namespace
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as the instructions in \p S, the key is \p Op.
+/// Otherwise the key is the main operation of \p S.
+static Value *isOneOf(const InstructionsState &S, Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && S.isOpcodeOrAlt(I))
+ return Op;
+ return S.getMainOp();
+}
+
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
@@ -865,6 +920,15 @@ static bool isValidForAlternation(unsigned Opcode) {
return true;
}
+// Check for inner dependencies. We cannot support a dependency on a main
+// operation; a dependency on an alternative operation is only allowed from a
+// main operation (for now, alternative-to-alternative dependencies are
+// rejected as well).
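+// E.g. (illustrative): for VL = {%a = add %x, %y; %b = sub %a, %z}, the
+// `sub` lane depends on the main (`add`) lane %a, so the bundle is rejected.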
+static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
+ const InstructionsState &S);
+
+// Determines whether the vector could be vectorized with copyable elements.
+static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt);
+
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
const TargetLibraryInfo &TLI);
@@ -917,19 +981,51 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
return InstructionsState::invalid();
Instruction *MainOp = cast<Instruction>(*It);
+ Instruction *AltOp = MainOp;
+ unsigned Opcode = MainOp->getOpcode();
+ unsigned AltOpcode = Opcode;
+ for (Value *V : iterator_range(It + 1, VL.end())) {
+ Instruction *Inst = dyn_cast<Instruction>(V);
+ if (!Inst)
+ continue;
+ unsigned VOpcode = Inst->getOpcode();
+ if (Inst && AltOpcode == Opcode && !isa<PHINode>(Inst) &&
+ VOpcode != Opcode && isValidForAlternation(VOpcode)) {
+ AltOpcode = VOpcode;
+ AltOp = Inst;
+ break;
+ }
+ }
unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
(VL.size() == 2 && InstCnt < 2))
return InstructionsState::invalid();
-
- bool IsCastOp = isa<CastInst>(MainOp);
bool IsBinOp = isa<BinaryOperator>(MainOp);
+ bool IsCopyable = false;
+ if (MainOp && AltOp && MainOp != AltOp) {
+ if (!IsBinOp && isa<BinaryOperator>(AltOp) && !isa<PHINode>(MainOp)) {
+ std::swap(MainOp, AltOp);
+ std::swap(AltOpcode, Opcode);
+ IsBinOp = true;
+ }
+ IsCopyable = VectorizeCopyable && isCopyableOp(VL, MainOp, AltOp);
+ if (IsCopyable && isa<CmpInst>(AltOp)) {
+ Type *Ty0 = MainOp->getOperand(0)->getType();
+ Type *Ty1 = AltOp->getOperand(0)->getType();
+ if (Ty0 != Ty1)
+ return InstructionsState::invalid();
+ } else if (!IsCopyable) {
+ MainOp = cast<Instruction>(*It);
+ AltOp = MainOp;
+ Opcode = MainOp->getOpcode();
+ AltOpcode = Opcode;
+ IsBinOp = isa<BinaryOperator>(MainOp);
+ }
+ }
+ bool IsCastOp = isa<CastInst>(MainOp);
bool IsCmpOp = isa<CmpInst>(MainOp);
CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
: CmpInst::BAD_ICMP_PREDICATE;
- Instruction *AltOp = MainOp;
- unsigned Opcode = MainOp->getOpcode();
- unsigned AltOpcode = Opcode;
bool SwappedPredsCompatible = IsCmpOp && [&]() {
SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
@@ -985,12 +1081,12 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
AltOp = I;
continue;
}
- } else if (IsCastOp && isa<CastInst>(I)) {
+ } else if ((IsCastOp || IsCopyable) && isa<CastInst>(I)) {
Value *Op0 = MainOp->getOperand(0);
Type *Ty0 = Op0->getType();
Value *Op1 = I->getOperand(0);
Type *Ty1 = Op1->getType();
- if (Ty0 == Ty1) {
+ if (Ty0 == Ty1 || IsCopyable) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode) {
@@ -1002,13 +1098,15 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
continue;
}
}
- } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
+ } else if (auto *Inst = dyn_cast<CmpInst>(I);
+ Inst && (IsCmpOp || IsCopyable)) {
auto *BaseInst = cast<CmpInst>(MainOp);
Type *Ty0 = BaseInst->getOperand(0)->getType();
Type *Ty1 = Inst->getOperand(0)->getType();
if (Ty0 == Ty1) {
- assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
- assert(InstOpcode == AltOpcode &&
+ assert((IsCopyable || InstOpcode == Opcode) &&
+ "Expected same CmpInst opcode.");
+ assert((IsCopyable || InstOpcode == AltOpcode) &&
"Alternate instructions are only supported by BinaryOperator "
"and CastInst.");
// Check for compatible operands. If the corresponding operands are not
@@ -1039,23 +1137,32 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
AltPred == CurrentPred || AltPred == SwappedCurrentPred)
continue;
}
- } else if (InstOpcode == Opcode) {
- assert(InstOpcode == AltOpcode &&
+ } else if (InstOpcode == Opcode ||
+ (IsCopyable && InstOpcode == AltOpcode)) {
+ assert((IsCopyable || InstOpcode == AltOpcode) &&
"Alternate instructions are only supported by BinaryOperator and "
"CastInst.");
+ Instruction *Op = MainOp;
+ if (IsCopyable) {
+ if (InstOpcode != Opcode && InstOpcode != AltOpcode) {
+ Op = I;
+ } else if (Opcode != AltOpcode && InstOpcode == AltOpcode) {
+ Op = AltOp;
+ }
+ }
if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
if (Gep->getNumOperands() != 2 ||
- Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
+ Gep->getOperand(0)->getType() != Op->getOperand(0)->getType())
return InstructionsState::invalid();
} else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
if (!isVectorLikeInstWithConstOps(EI))
return InstructionsState::invalid();
} else if (auto *LI = dyn_cast<LoadInst>(I)) {
- auto *BaseLI = cast<LoadInst>(MainOp);
+ auto *BaseLI = cast<LoadInst>(Op);
if (!LI->isSimple() || !BaseLI->isSimple())
return InstructionsState::invalid();
} else if (auto *Call = dyn_cast<CallInst>(I)) {
- auto *CallBase = cast<CallInst>(MainOp);
+ auto *CallBase = cast<CallInst>(Op);
if (Call->getCalledFunction() != CallBase->getCalledFunction())
return InstructionsState::invalid();
if (Call->hasOperandBundles() &&
@@ -1070,13 +1177,14 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
return InstructionsState::invalid();
if (!ID) {
SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
- if (Mappings.size() != BaseMappings.size() ||
- Mappings.front().ISA != BaseMappings.front().ISA ||
- Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
- Mappings.front().VectorName != BaseMappings.front().VectorName ||
- Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
- Mappings.front().Shape.Parameters !=
- BaseMappings.front().Shape.Parameters)
+ if (Mappings.size() &&
+ (Mappings.size() != BaseMappings.size() ||
+ Mappings.front().ISA != BaseMappings.front().ISA ||
+ Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
+ Mappings.front().VectorName != BaseMappings.front().VectorName ||
+ Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
+ Mappings.front().Shape.Parameters !=
+ BaseMappings.front().Shape.Parameters))
return InstructionsState::invalid();
}
}
@@ -1125,6 +1233,54 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
}
}
+static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
+ const InstructionsState &S) {
+ SmallSet<Value *, 4> Ops;
+ SmallSet<Value *, 4> AltOps;
+ unsigned Opcode = S.getOpcode();
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (I->getOpcode() == Opcode)
+ Ops.insert(V);
+ else
+ AltOps.insert(V);
+ }
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ for (Use &U : I->operands())
+ if (auto *Op = dyn_cast<Instruction>(U.get()); Op && Ops.contains(Op))
+ return false;
+ if (I->getOpcode() != Opcode) {
+ for (Use &U : I->operands())
+ if (auto *Op = dyn_cast<Instruction>(U.get());
+ Op && AltOps.contains(Op))
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
+ if (any_of(VL, IsaPred<PoisonValue>) || Main == Alt ||
+ !isa<BinaryOperator>(Main) || !isa<Instruction>(Alt) ||
+ find_if(VL, IsaPred<PHINode>) != VL.end())
+ return false;
+
+ Instruction *MainOp = cast<Instruction>(Main);
+ Instruction *AltOp = cast<Instruction>(Alt);
+
+ if (isa<BinaryOperator>(MainOp) && !isa<BinaryOperator>(AltOp) &&
+ isValidForAlternation(MainOp->getOpcode()) &&
+ isValidForAlternation(AltOp->getOpcode()) &&
+ tryToRepresentAsInstArg(MainOp->getOpcode(), AltOp) &&
+ tryToRepresentAsInstArg(AltOp->getOpcode(), MainOp))
+ return true;
+ return false;
+}
/// \returns the AA location that is being access by the instruction.
static MemoryLocation getLocation(Instruction *I) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -1473,6 +1629,7 @@ class BoUpSLP {
ScalarToTreeEntries.clear();
MustGather.clear();
NonScheduledFirst.clear();
+ CopyableAltOp.clear();
EntryToLastInstruction.clear();
LoadEntriesToVectorize.clear();
IsGraphTransformMode = false;
@@ -2488,8 +2645,16 @@ class BoUpSLP {
}
bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
- OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
- APO, false};
+ Instruction *Inst = cast<Instruction>(VL[Lane]);
+ if (Inst->getOpcode() != MainOp->getOpcode() &&
+ OpIdx > (Inst->getNumOperands() - 1)) {
+ OpsVec[OpIdx][Lane] = {
+ PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
+ false};
+ } else {
+ OpsVec[OpIdx][Lane] = {
+ cast<Instruction>(VL[Lane])->getOperand(OpIdx), APO, false};
+ }
}
}
}
@@ -3416,7 +3581,7 @@ class BoUpSLP {
}
/// Some of the instructions in the list have alternate opcodes.
- bool isAltShuffle() const { return S.isAltShuffle(); }
+ bool isAltShuffle() const { return S.isAltShuffle() && !S.isAltOpCopy(); }
bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
@@ -3444,6 +3609,8 @@ class BoUpSLP {
unsigned getAltOpcode() const { return S.getAltOpcode(); }
+ bool isAltOpCopy() const { return S.isAltOpCopy(); }
+
bool hasState() const { return S.valid(); }
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
@@ -3543,6 +3710,7 @@ class BoUpSLP {
if (S) {
dbgs() << "MainOp: " << *S.getMainOp() << "\n";
dbgs() << "AltOp: " << *S.getAltOp() << "\n";
+ dbgs() << "isAltOpCopy: " << S.isAltOpCopy() << "\n";
} else {
dbgs() << "MainOp: NULL\n";
dbgs() << "AltOp: NULL\n";
@@ -3636,7 +3804,7 @@ class BoUpSLP {
// for non-power-of-two vectors.
assert(
(hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
- ReuseShuffleIndices.empty()) &&
+ S.isAltOpCopy() || ReuseShuffleIndices.empty()) &&
"Reshuffling scalars not yet supported for nodes with padding");
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
@@ -3660,10 +3828,18 @@ class BoUpSLP {
}
if (!Last->isGather()) {
SmallPtrSet<Value *, 4> Processed;
- for (Value *V : VL) {
+ unsigned Opcode = S.getOpcode();
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ Value *V = VL[i];
if (isa<PoisonValue>(V))
continue;
auto It = ScalarToTreeEntries.find(V);
+ Instruction *I = dyn_cast<Instruction>(V);
+ bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
+ if (S.isAltOpCopy() && IsAltInst) {
+ CopyableAltOp.insert(V);
+ continue;
+ }
assert(
(It == ScalarToTreeEntries.end() ||
(It->getSecond().size() == 1 && It->getSecond().front() == Last) ||
@@ -3759,13 +3935,15 @@ class BoUpSLP {
bool areAltOperandsProfitable(const InstructionsState &S,
ArrayRef<Value *> VL) const;
+ /// Checks whether the operations can be represented as copyable by
+ /// examining the operations' operands.
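+ /// E.g. (illustrative): for main opcode `add` and alternate `sub`, this
+ /// holds when every `add` lane fed by a `sub` uses it in the same operand
+ /// position and at least one lane is itself a `sub`.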
+ bool canRepresentAsCopyable(const InstructionsState &S, ArrayRef<Value *> VL);
+
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
- TreeEntry::EntryState
- getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
- bool IsScatterVectorizeUserTE,
- OrdersType &CurrentOrder,
- SmallVectorImpl<Value *> &PointerOps);
+ TreeEntry::EntryState getScalarsVectorizationState(
+ InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+ OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps);
/// Maps a specific scalar to its tree entry(ies).
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -3776,6 +3954,9 @@ class BoUpSLP {
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
+ /// A set of scalars that we are considering as copyable operations.
+ ValueSet CopyableAltOp;
+
/// A set of first non-schedulable values.
ValueSet NonScheduledFirst;
@@ -3908,15 +4089,16 @@ class BoUpSLP {
ScheduleData() = default;
- void init(int BlockSchedulingRegionID, Instruction *I) {
+ void init(int BlockSchedulingRegionID, Value *OpVal) {
FirstInBundle = this;
NextInBundle = nullptr;
NextLoadStore = nullptr;
IsScheduled = false;
SchedulingRegionID = BlockSchedulingRegionID;
clearDependencies();
- Inst = I;
+ OpValue = OpVal;
TE = nullptr;
+ IsCopy = false;
}
/// Verify basic self consistency properties
@@ -4029,6 +4211,9 @@ class BoUpSLP {
Instruction *Inst = nullptr;
+ /// The key value of the current instruction in the schedule data.
+ Value *OpValue = nullptr;
+
/// The TreeEntry that this instruction corresponds to.
TreeEntry *TE = nullptr;
@@ -4076,6 +4261,9 @@ class BoUpSLP {
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
bool IsScheduled = false;
+
+ /// True if this instruction is a copy.
+ bool IsCopy = false;
};
#ifndef NDEBUG
@@ -4133,15 +4321,28 @@ class BoUpSLP {
if (BB != I->getParent())
// Avoid lookup if can't possibly be in map.
return nullptr;
- ScheduleData *SD = ScheduleDataMap.lookup(I);
- if (SD && isInSchedulingRegion(SD))
- return SD;
+ return getScheduleData(I, I);
+ }
+
+ ScheduleData *getScheduleData(Value *V) { return getScheduleData(V, V); }
+
+ ScheduleData *getScheduleData(Value *V, Value *Key) {
+ auto I = ScheduleDataMap.find(V);
+ if (I != ScheduleDataMap.end()) {
+ ScheduleData *SD = I->second.lookup(Key);
+ if (SD && isInSchedulingRegion(SD))
+ return SD;
+ }
return nullptr;
}
- ScheduleData *getScheduleData(Value *V) {
- if (auto *I = dyn_cast<Instruction>(V))
- return getScheduleData(I);
+ ScheduleData *getScheduleData(Value *V, const TreeEntry *E) {
+ auto I = ScheduleDataMap.find(V);
+ if (I == ScheduleDataMap.end())
+ return nullptr;
+ for (auto &P : I->second)
+ if (isInSchedulingRegion(P.second) && P.second->TE == E)
+ return P.second;
return nullptr;
}
@@ -4158,30 +4359,32 @@ class BoUpSLP {
for (ScheduleData *BundleMember = SD; BundleMember;
BundleMember = BundleMember->NextInBundle) {
-
// Handle the def-use chain dependencies.
// Decrement the unscheduled counter and insert to ready list if ready.
- auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
- ScheduleData *OpDef = getScheduleData(I);
- if (OpDef && OpDef->hasValidDependencies() &&
- OpDef->incrementUnscheduledDeps(-1) == 0) {
- // There are no more unscheduled dependencies after
- // decrementing, so we can put the dependent instruction
- // into the ready list.
- ScheduleData *DepBundle = OpDef->FirstInBundle;
- assert(!DepBundle->IsScheduled &&
- "already scheduled bundle gets ready");
- ReadyList.insert(DepBundle);
- LLVM_DEBUG(dbgs()
- << "SLP: gets ready (def): " << *DepBundle << "\n");
- }
+ auto &&DecrUnsched = [this, &ReadyList, &BundleMember](Instruction *I) {
+ doForAllOpcodes(I, [&ReadyList, &BundleMember,
+ &I](ScheduleData *OpDef) {
+ if (OpDef && OpDef->hasValidDependencies() &&
+ BundleMember->Inst != I &&
+ OpDef->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after
+ // decrementing, so we can put the dependent instruction
+ // into the ready list.
+ ScheduleData *DepBundle = OpDef->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (def): " << *DepBundle << "\n");
+ }
+ });
};
// If BundleMember is a vector bundle, its operands may have been
// reordered during buildTree(). We therefore need to get its operands
// through the TreeEntry.
- if (TreeEntry *TE = BundleMember->TE) {
+ if (TreeEntry *TE = BundleMember->TE; TE && !TE->isAltOpCopy()) {
// Need to search for the lane since the tree entry can be reordered.
auto *In = BundleMember->Inst;
int Lane = std::distance(TE->Scalars.begin(),
@@ -4197,6 +4400,7 @@ class BoUpSLP {
assert(
In &&
(isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
+ BundleMember->IsCopy ||
In->getNumOperands() == TE->getNumOperands()) &&
"Missed TreeEntry operands?");
@@ -4257,7 +4461,8 @@ class BoUpSLP {
"primary schedule data not in window?");
assert(isInSchedulingRegion(SD->FirstInBundle) &&
"entire bundle in window!");
- SD->verify();
+ (void)SD;
+ doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
}
for (auto *SD : ReadyInsts) {
@@ -4267,35 +4472,47 @@ class BoUpSLP {
}
}
+ void doForAllOpcodes(Value *V,
+ function_ref<void(ScheduleData *SD)> Action) {
+ auto I = ScheduleDataMap.find(V);
+ if (I != ScheduleDataMap.end())
+ for (auto &P : I->second)
+ if (isInSchedulingRegion(P.second))
+ Action(P.second);
+ }
+
/// Put all instructions into the ReadyList which are ready for scheduling.
template <typename ReadyListType>
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- ScheduleData *SD = getScheduleData(I);
- if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
- SD->isReady()) {
- ReadyList.insert(SD);
- LLVM_DEBUG(dbgs()
- << "SLP: initially in ready list: " << *SD << "\n");
- }
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
+ SD->isReady()) {
+ ReadyList.insert(SD);
+ LLVM_DEBUG(dbgs()
+ << "SLP: initially in ready list: " << *SD << "\n");
+ }
+ });
}
}
/// Build a bundle from the ScheduleData nodes corresponding to the
/// scalar instruction for each lane.
- ScheduleData *buildBundle(ArrayRef<Value *> VL);
+ ScheduleData *buildBundle(ArrayRef<Value *> VL, const InstructionsState &S,
+ bool &ReSchedule);
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
/// \returns the scheduling bundle. The returned Optional value is not
/// std::nullopt if \p VL is allowed to be scheduled.
- std::optional<ScheduleData *>
- tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S);
+ std::optional<ScheduleData *> tryScheduleBundle(ArrayRef<Value *> VL,
+ BoUpSLP *SLP,
+ const InstructionsState &S,
+ bool AnyCopies);
/// Un-bundles a group of instructions.
- void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
+ void cancelScheduling(ArrayRef<Value *> VL, ScheduleData *Bundle);
/// Allocates schedule data chunk.
ScheduleData *allocateScheduleDataChunks();
@@ -4333,7 +4550,7 @@ class BoUpSLP {
/// Attaches ScheduleData to Instruction.
/// Note that the mapping survives during all vectorization iterations, i.e.
/// ScheduleData structures are recycled.
- DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
+ DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>> ScheduleDataMap;
/// The ready-list for scheduling (only used for the dry-run).
SetVector<ScheduleData *> ReadyInsts;
@@ -6330,6 +6547,8 @@ bool BoUpSLP::canReorderOperands(
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
+ if (TE->isAltOpCopy())
+ return false;
// Add the node to the list of the ordered nodes with the identity
// order.
Edges.emplace_back(I, TE);
@@ -6732,8 +6951,11 @@ void BoUpSLP::buildExternalUses(
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
- if (!isa<Instruction>(Scalar))
+ if (!isa<Instruction>(Scalar) ||
+ (Entry->isAltOpCopy() &&
+ cast<Instruction>(Scalar)->getOpcode() != Entry->getOpcode()))
continue;
+
// All uses must be replaced already? No need to do it again.
auto It = ScalarToExtUses.find(Scalar);
if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
@@ -7687,6 +7909,52 @@ static bool isAlternateInstruction(const Instruction *I,
const Instruction *AltOp,
const TargetLibraryInfo &TLI);
+bool BoUpSLP::canRepresentAsCopyable(const InstructionsState &S,
+ ArrayRef<Value *> VL) {
+ unsigned Opcode0 = S.getOpcode();
+ unsigned Opcode1 = S.getAltOpcode();
+ DenseMap<unsigned, unsigned> AltOps;
+ SmallVector<unsigned> MainAltOps;
+ unsigned Operand;
+
+ if (!checkCopyableInnerDep(VL, S))
+ return false;
+ if (isCopyableOp(VL, S.getMainOp(), S.getAltOp()))
+ return true;
+ if ((isa<BinaryOperator>(S.getMainOp()) && Opcode0 == Opcode1) ||
+ (!isValidForAlternation(Opcode0) || !isValidForAlternation(Opcode1)) ||
+ !tryToRepresentAsInstArg(S.getOpcode(), S.getAltOp()) ||
+ !tryToRepresentAsInstArg(S.getAltOpcode(), S.getMainOp()))
+ return false;
+ for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+ Instruction *Inst = dyn_cast<Instruction>(VL[I]);
+ if (!Inst)
+ return false;
+ if (Inst->getOpcode() == Opcode0) {
+ for (unsigned Op : seq<unsigned>(0, S.getMainOp()->getNumOperands())) {
+ Instruction *Inst1 = dyn_cast<Instruction>(Inst->getOperand(Op));
+ if (!Inst1)
+ continue;
+ if (Inst1->getOpcode() == Opcode0)
+ return false;
+ if (AltOps.contains(I) || (AltOps.size() && Op != Operand))
+ return false;
+ if (Inst1->getOpcode() == Opcode1) {
+ if (!AltOps.size())
+ Operand = Op;
+ AltOps[I] = Op;
+ }
+ }
+ } else if (Inst->getOpcode() == Opcode1) {
+ MainAltOps.push_back(I);
+ }
+ }
+ if (AltOps.size() > 0 && MainAltOps.size() > 0)
+ return true;
+
+ return false;
+}
+
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
ArrayRef<Value *> VL) const {
unsigned Opcode0 = S.getOpcode();
@@ -7697,6 +7965,8 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
Opcode0, Opcode1, OpcodeMask))
return true;
SmallVector<ValueList> Operands;
+ if (S.getMainOp()->getNumOperands() != S.getAltOp()->getNumOperands())
+ return false;
for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
Operands.emplace_back();
// Prepare the operand vector.
@@ -7861,9 +8131,8 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
}
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
- const InstructionsState &S, ArrayRef<Value *> VL,
- bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
- SmallVectorImpl<Value *> &PointerOps) {
+ InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
+ OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) {
assert(S.getMainOp() &&
"Expected instructions with same/alternate opcodes only.");
@@ -8218,6 +8487,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::Vectorize;
}
case Instruction::ShuffleVector: {
+ if (VectorizeCopyable && isCopyableOp(VL, S.getMainOp(), S.getAltOp()) &&
+ checkCopyableInnerDep(VL, S)) {
+ S.setAltOpCopy(true);
+ return TreeEntry::Vectorize;
+ }
if (!S.isAltShuffle()) {
// REVEC can support non alternate shuffle.
if (SLPReVec && getShufflevectorNumGroups(VL))
@@ -8234,6 +8508,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
"the whole alt sequence is not profitable.\n");
return TreeEntry::NeedToGather;
}
+ if (VectorizeCopyable && canRepresentAsCopyable(S, VL))
+ S.setAltOpCopy(true);
return TreeEntry::Vectorize;
}
@@ -8516,6 +8792,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
auto *I1 = cast<Instruction>(VL.front());
auto *I2 = cast<Instruction>(VL.back());
+ if (I1->getNumOperands() != I2->getNumOperands())
+ return true;
for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
Candidates.emplace_back().emplace_back(I1->getOperand(Op),
I2->getOperand(Op));
@@ -8656,7 +8934,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<Value *> PointerOps;
TreeEntry::EntryState State = getScalarsVectorizationState(
S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
- if (State == TreeEntry::NeedToGather) {
+ if (S.isAltOpCopy()) {
+ for (Value *V : VL) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (I->getOpcode() == S.getAltOpcode() && CopyableAltOp.contains(V)) {
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ return;
+ }
+ }
+ }
+ if (State == TreeEntry::NeedToGather ||
+ (S.isAltOpCopy() && !has_single_bit(UniqueValues.size()))) {
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
return;
@@ -8666,18 +8957,25 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (!BSRef)
BSRef = std::make_unique<BlockScheduling>(BB);
- BlockScheduling &BS = *BSRef;
+ bool AnyCopies = false;
+ for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
+ if (VectorizableTree[Id]->isAltOpCopy())
+ AnyCopies = true;
+ }
- std::optional<ScheduleData *> Bundle =
- BS.tryScheduleBundle(UniqueValues, this, S);
+ BlockScheduling &BS = *BSRef;
+ std::optional<ScheduleData *> Bundle;
+ Bundle = BS.tryScheduleBundle(UniqueValues, this, S, AnyCopies);
#ifdef EXPENSIVE_CHECKS
// Make sure we didn't break any internal invariants
BS.verify();
#endif
- if (!Bundle) {
+ if (!Bundle || (S.isAltOpCopy() && !Bundle.value())) {
LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
- !BS.getScheduleData(VL0)->isPartOfBundle()) &&
+ !BS.getScheduleData(VL0)->isPartOfBundle() || S.isAltOpCopy() ||
+ (BS.getScheduleData(VL0)->TE &&
+ BS.getScheduleData(VL0)->TE->isAltOpCopy())) &&
"tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
@@ -9078,7 +9376,73 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
TE->dump());
}
-
+ if (S.isAltOpCopy() && !isCopyableOp(VL, S.getMainOp(), S.getAltOp())) {
+ ValueList Left, Right;
+ unsigned Opcode0 = S.getOpcode();
+ unsigned Opcode1 = S.getAltOpcode();
+ unsigned Operand;
+ bool IsOperandSet = false;
+ ValueList newMainVL;
+ ValueList newVL;
+ for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+ Instruction *Inst = cast<Instruction>(VL[I]);
+ if (Inst->getOpcode() == Opcode0) {
+ newMainVL.push_back(VL[I]);
+ unsigned Op = 0;
+ Instruction *Inst1 = dyn_cast<Instruction>(Inst->getOperand(Op));
+ if (!Inst1) {
+ newVL.push_back(Inst->getOperand(Op));
+ continue;
+ }
+ if (IsOperandSet && Op != Operand)
+ return;
+ if (Inst1->getOpcode() == Opcode1) {
+ if (!IsOperandSet) {
+ Operand = Op;
+ IsOperandSet = true;
+ }
+ }
+ newVL.push_back(Inst1);
+ } else if (Inst->getOpcode() == Opcode1) {
+ newVL.push_back(Inst);
+ }
+ }
+ VLOperands Ops(VL, S, *this);
+ Left = Ops.getVL(0);
+ Right = Ops.getVL(1);
+ for (unsigned I = 0, VF = VL.size(); I < VF; ++I)
+ if ((cast<Instruction>(VL[I]))->getOpcode() != Opcode0) {
+ Right[I] = ConstantExpr::getBinOpIdentity(
+ Opcode0, Right[0]->getType(), true);
+ }
+ TE->setOperand(0, newVL);
+ TE->setOperand(1, Right);
+ buildTree_rec(newVL, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ return;
+ } else if (S.isAltOpCopy()) {
+ ValueList Left, Right;
+ unsigned Opcode0 = S.getOpcode();
+ VLOperands Ops(VL, S, *this);
+ Left = Ops.getVL(0);
+ Right = Ops.getVL(1);
+ ValueList Left_new, Right_new;
+ for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+ if ((cast<Instruction>(VL[I]))->getOpcode() != Opcode0) {
+ Left_new.push_back(VL[I]);
+ Right_new.push_back(ConstantExpr::getBinOpIdentity(
+ Opcode0, S.getMainOp()->getType(), true));
+ } else {
+ Left_new.push_back(Left[I]);
+ Right_new.push_back(Right[I]);
+ }
+ }
+ TE->setOperand(0, Left_new);
+ TE->setOperand(1, Right_new);
+ buildTree_rec(Left_new, Depth + 1, {TE, 0});
+ buildTree_rec(Right_new, Depth + 1, {TE, 1});
+ return;
+ }
// Reorder operands if reordering would enable vectorization.
auto *CI = dyn_cast<CmpInst>(VL0);
if (CI && any_of(VL, [](Value *V) {
@@ -11344,8 +11708,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
E->getMainOp()->getType()->isPointerTy())) &&
"Invalid VL");
Instruction *VL0 = E->getMainOp();
- unsigned ShuffleOrOp =
- E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+ unsigned ShuffleOrOp = (E->isAltShuffle() && !E->isAltOpCopy())
+ ? (unsigned)Instruction::ShuffleVector
+ : E->getOpcode();
if (E->CombinedOp != TreeEntry::NotCombinedOp)
ShuffleOrOp = E->CombinedOp;
SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
@@ -11992,7 +12357,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
Instruction::isCast(E->getAltOpcode())) ||
- (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
+ (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp())) ||
+ E->isAltOpCopy()) &&
"Invalid Shuffle Vector Operand");
// Try to find the previous shuffle node with the same operands and same
// main/alternate ops.
@@ -12780,6 +13146,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
std::optional<unsigned> InsertIdx = getElementIndex(VU);
if (InsertIdx) {
const TreeEntry *ScalarTE = &EU.E;
+ if (!ScalarTE)
+ continue;
auto *It = find_if(
ShuffledInserts,
[this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
@@ -12862,8 +13230,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
EU.Lane, EU.Scalar, ScalarUserAndIdx);
}
// Leave the scalar instructions as is if they are cheaper than extracts.
- if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
- Entry->getOpcode() == Instruction::Load) {
+ if (Entry &&
+ (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
+ Entry->getOpcode() == Instruction::Load)) {
// Checks if the user of the external scalar is phi in loop body.
auto IsPhiInLoop = [&](const ExternalUser &U) {
if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
@@ -14128,13 +14497,20 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// VL.back() and iterate over schedule data until we reach the end of the
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB) && !E->isGather()) {
- Value *V = E->isOneOf(E->Scalars.back());
+ Value *V = E->getMainOp();
if (doesNotNeedToBeScheduled(V))
V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
- auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
- if (Bundle && Bundle->isPartOfBundle())
+ auto *Bundle = BlocksSchedules[BB]->getScheduleData(V, E);
+ if (Bundle && Bundle->isPartOfBundle()) {
+ if (any_of(E->Scalars, [&](Value *V) {
+ return (!doesNotNeedToBeScheduled(V) && CopyableAltOp.contains(V));
+ }))
+ Bundle = Bundle->FirstInBundle;
for (; Bundle; Bundle = Bundle->NextInBundle)
- Res = Bundle->Inst;
+ if (!CopyableAltOp.contains(Bundle->Inst) &&
+ !doesNotNeedToBeScheduled(Bundle->Inst))
+ Res = Bundle->Inst;
+ }
}
// LastInst can still be null at this point if there's either not an entry
@@ -14876,8 +15252,12 @@ BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
const InstructionsState &S) {
if (!S)
return nullptr;
- if (TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
- VE && VE->UserTreeIndex.UserTE == E &&
+ TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL);
+ if (VE && VE->UserTreeIndex.UserTE == E &&
+ VE->UserTreeIndex.EdgeIdx == NodeIdx)
+ return VE;
+ VE = getSameValuesTreeEntry(S.getAltOp(), VL);
+ if (VE && VE->isAltOpCopy() && VE->UserTreeIndex.UserTE == E &&
VE->UserTreeIndex.EdgeIdx == NodeIdx)
return VE;
return nullptr;
@@ -16594,6 +16974,8 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
if (User && !is_contained(Scalar->users(), User))
continue;
const TreeEntry *E = &ExternalUse.E;
+ if (!E && CopyableAltOp.contains(Scalar))
+ continue;
assert(E && "Invalid scalar");
assert(!E->isGather() && "Extracting from a gather list");
// Non-instruction pointers are not deleted, just skip them.
@@ -16985,6 +17367,9 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
continue;
if (isa<PoisonValue>(Scalar))
continue;
+ if (Entry->isAltOpCopy() &&
+ cast<Instruction>(Scalar)->getOpcode() != Entry->getOpcode())
+ continue;
#ifndef NDEBUG
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
@@ -17221,14 +17606,59 @@ void BoUpSLP::optimizeGatherSequence() {
GatherShuffleExtractSeq.clear();
}
-BoUpSLP::ScheduleData *
-BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
+BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::buildBundle(
+ ArrayRef<Value *> VL, const InstructionsState &S, bool &ReSchedule) {
ScheduleData *Bundle = nullptr;
ScheduleData *PrevInBundle = nullptr;
+ unsigned Opcode = S.getOpcode();
+ ValueList Keys;
+
for (Value *V : VL) {
+ auto *SD = getScheduleData(V);
+ bool FoundKey = false;
+ if (SD && !SD->isPartOfBundle()) {
+ Keys.push_back(V);
+ continue;
+ }
+ for (Value *Key : VL) {
+ SD = getScheduleData(V, Key);
+ if (SD && SD->isPartOfBundle()) {
+ ReSchedule = true;
+ } else if (!SD || !SD->isPartOfBundle()) {
+ FoundKey = true;
+ Keys.push_back(Key);
+ break;
+ }
+ }
+ if (!FoundKey) {
+ for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E;
+ ++It) {
+ Value *Key = &*It;
+ if (!Key)
+ continue;
+ SD = getScheduleData(V, Key);
+ if (!SD || !SD->isPartOfBundle()) {
+ FoundKey = true;
+ Keys.push_back(Key);
+ break;
+ }
+ }
+ }
+ }
+ for (auto [V, Key] : zip(VL, Keys)) {
if (doesNotNeedToBeScheduled(V))
continue;
- ScheduleData *BundleMember = getScheduleData(V);
+ Instruction *I = dyn_cast<Instruction>(V);
+ bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
+ ScheduleData *BundleMember = getScheduleData(V, Key);
+ if (V != Key) {
+ ScheduleData *SD = allocateScheduleDataChunks();
+ Instruction *I = dyn_cast<Instruction>(V);
+ SD->Inst = I;
+ SD->init(SchedulingRegionID, Key);
+ ScheduleDataMap[I][Key] = SD;
+ BundleMember = getScheduleData(V, Key);
+ }
assert(BundleMember &&
"no ScheduleData for bundle member "
"(maybe not in same basic block)");
@@ -17242,6 +17672,8 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
// Group the instructions to a bundle.
BundleMember->FirstInBundle = Bundle;
+ if (S.isAltOpCopy() && IsAltInst)
+ BundleMember->IsCopy = true;
PrevInBundle = BundleMember;
}
assert(Bundle && "Failed to find schedule bundle");
@@ -17252,7 +17684,8 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
// and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S) {
+ const InstructionsState &S,
+ bool AnyCopies) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
if (isa<PHINode>(S.getMainOp()) ||
@@ -17261,19 +17694,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
+ bool IsAltOpCopy = S.isAltOpCopy();
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
- auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
- ScheduleData *Bundle) {
+ auto TryScheduleBundleImpl = [this, OldScheduleEnd, IsAltOpCopy, AnyCopies,
+ SLP](bool ReSchedule, ScheduleData *Bundle) {
// The scheduling region got new instructions at the lower end (or it is a
// new region for the first bundle). This makes it necessary to
// recalculate all dependencies.
// It is seldom that this needs to be done a second time after adding the
// initial bundle to the region.
- if (ScheduleEnd != OldScheduleEnd) {
+ if (ScheduleEnd != OldScheduleEnd || IsAltOpCopy || AnyCopies) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
- if (ScheduleData *SD = getScheduleData(I))
- SD->clearDependencies();
+ doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
ReSchedule = true;
}
if (Bundle) {
@@ -17339,24 +17772,34 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ReSchedule = true;
}
- auto *Bundle = buildBundle(VL);
+ auto *Bundle = buildBundle(VL, S, ReSchedule);
+ if (!Bundle)
+ return std::nullopt;
TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle->isReady()) {
- cancelScheduling(VL, S.getMainOp());
+ cancelScheduling(VL, Bundle);
+ // If we have any copyable elements, we have to clear all dependencies,
+ // since they were calculated for the vectorized bundles with copies.
+ if (AnyCopies || IsAltOpCopy) {
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
+ doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
+ resetSchedule();
+ }
return std::nullopt;
}
return Bundle;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
- Value *OpValue) {
- if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
- doesNotNeedToSchedule(VL))
+ ScheduleData *Bundle) {
+ if (isa<PHINode>(VL.front()) || isVectorLikeInstWithConstOps(VL.front()) ||
+ doesNotNeedToSchedule(VL) || !Bundle)
return;
- if (doesNotNeedToBeScheduled(OpValue))
- OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
- ScheduleData *Bundle = getScheduleData(OpValue);
+ if (Bundle->FirstInBundle)
+ Bundle = Bundle->FirstInBundle;
+
LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
@@ -17376,9 +17819,17 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
ScheduleData *Next = BundleMember->NextInBundle;
BundleMember->NextInBundle = nullptr;
BundleMember->TE = nullptr;
+ BundleMember->IsCopy = false;
if (BundleMember->unscheduledDepsInBundle() == 0) {
ReadyInsts.insert(BundleMember);
}
+ auto I = ScheduleDataMap.find(BundleMember->Inst);
+ if (I != ScheduleDataMap.end()) {
+ for (auto &SD : I->second) {
+ if (SD.second == BundleMember && SD.first != BundleMember->Inst)
+ ScheduleDataMap[BundleMember->Inst].erase(SD.first);
+ }
+ }
BundleMember = Next;
}
}
@@ -17394,19 +17845,34 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
Value *V, const InstructionsState &S) {
+ if (getScheduleData(V, S.getMainOp()))
+ return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
!doesNotNeedToBeScheduled(I) &&
"phi nodes/insertelements/extractelements/extractvalues don't need to "
"be scheduled");
- if (getScheduleData(I))
+ auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
+ ScheduleData *ISD = getScheduleData(I);
+ if (!ISD)
+ return false;
+ assert(isInSchedulingRegion(ISD) &&
+ "ScheduleData not in scheduling region");
+ ScheduleData *SD = allocateScheduleDataChunks();
+ SD->Inst = I;
+ SD->init(SchedulingRegionID, S.getMainOp());
+ return true;
+ };
+ if (CheckScheduleForI(I))
return true;
if (!ScheduleStart) {
// It's the first instruction in the new region.
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
+ if (isOneOf(S, I) != I)
+ CheckScheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
@@ -17445,6 +17911,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
"Instruction is in wrong basic block.");
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
+ if (isOneOf(S, I) != I)
+ CheckScheduleForI(I);
LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
<< "\n");
return true;
@@ -17457,6 +17925,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
nullptr);
ScheduleEnd = I->getNextNode();
+ if (isOneOf(S, I) != I)
+ CheckScheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
return true;
@@ -17471,10 +17941,14 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
// No need to allocate data for non-schedulable instructions.
if (doesNotNeedToBeScheduled(I))
continue;
- ScheduleData *SD = ScheduleDataMap.lookup(I);
+ ScheduleData *SD = nullptr;
+ auto It = ScheduleDataMap.find(I);
+ if (It != ScheduleDataMap.end())
+ SD = It->second.lookup(I);
if (!SD) {
SD = allocateScheduleDataChunks();
- ScheduleDataMap[I] = SD;
+ ScheduleDataMap[I][I] = SD;
+ SD->Inst = I;
}
assert(!isInSchedulingRegion(SD) &&
"new ScheduleData already in scheduling region");
@@ -17516,11 +17990,20 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
while (!WorkList.empty()) {
ScheduleData *SD = WorkList.pop_back_val();
+ bool ResetDeps = false;
+ for (ScheduleData *BundleMember = SD; BundleMember;
+ BundleMember = BundleMember->NextInBundle)
+ if (!BundleMember->hasValidDependencies())
+ ResetDeps = true;
+
for (ScheduleData *BundleMember = SD; BundleMember;
BundleMember = BundleMember->NextInBundle) {
assert(isInSchedulingRegion(BundleMember));
- if (BundleMember->hasValidDependencies())
+ if (BundleMember->hasValidDependencies()) {
+ if (ResetDeps)
+ BundleMember->resetUnscheduledDeps();
continue;
+ }
LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
<< "\n");
@@ -17529,26 +18012,32 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
// Handle def-use chain dependencies.
for (User *U : BundleMember->Inst->users()) {
- if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ doForAllOpcodes(I, [&](ScheduleData *UseSD) {
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if ((UseSD->IsCopy || BundleMember->IsCopy) &&
+ DestBundle == BundleMember->FirstInBundle)
+ return;
+ BundleMember->Dependencies++;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ });
+ }
+ }
+
+ auto MakeControlDependent = [&](Instruction *I) {
+ doForAllOpcodes(I, [&](ScheduleData *DepDest) {
+ assert(DepDest && "must be in schedule window");
+ DepDest->ControlDependencies.push_back(BundleMember);
BundleMember->Dependencies++;
- ScheduleData *DestBundle = UseSD->FirstInBundle;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
if (!DestBundle->IsScheduled)
BundleMember->incrementUnscheduledDeps(1);
if (!DestBundle->hasValidDependencies())
WorkList.push_back(DestBundle);
- }
- }
-
- auto MakeControlDependent = [&](Instruction *I) {
- auto *DepDest = getScheduleData(I);
- assert(DepDest && "must be in schedule window");
- DepDest->ControlDependencies.push_back(BundleMember);
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = DepDest->FirstInBundle;
- if (!DestBundle->IsScheduled)
- BundleMember->incrementUnscheduledDeps(1);
- if (!DestBundle->hasValidDependencies())
- WorkList.push_back(DestBundle);
+ });
};
// Any instruction which isn't safe to speculate at the beginning of the
@@ -17684,12 +18173,12 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
assert(ScheduleStart &&
"tried to reset schedule on block which has not been scheduled");
for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- if (ScheduleData *SD = getScheduleData(I)) {
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
assert(isInSchedulingRegion(SD) &&
"ScheduleData not in scheduling region");
SD->IsScheduled = false;
SD->resetUnscheduledDeps();
- }
+ });
}
ReadyInsts.clear();
}
@@ -17718,44 +18207,99 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
}
};
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
+ SmallDenseMap<Value *, ScheduleData *> CopyElementsMap;
// Ensure that all dependency data is updated (for nodes in the sub-graph)
// and fill the ready-list with initial instructions.
int Idx = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
- if (ScheduleData *SD = BS->getScheduleData(I)) {
+ BS->doForAllOpcodes(I, [this, &Idx, &CopyElementsMap,
+ BS](ScheduleData *SD) {
[[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(SD->Inst);
- assert((isVectorLikeInstWithConstOps(SD->Inst) ||
+ assert((isVectorLikeInstWithConstOps(SD->Inst) || SD->IsCopy ||
SD->isPartOfBundle() ==
(!SDTEs.empty() &&
!doesNotNeedToSchedule(SDTEs.front()->Scalars))) &&
"scheduler and vectorizer bundle mismatch");
SD->FirstInBundle->SchedulingPriority = Idx++;
+ for (TreeEntry *SDTE : SDTEs)
+ if (SDTE && SDTE->isAltOpCopy()) {
+ ScheduleData *Bundle = SD->FirstInBundle;
+ for (ScheduleData *BundleMember = Bundle; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ if (BundleMember->IsCopy)
+ CopyElementsMap[BundleMember->Inst] = Bundle;
+ }
+ }
if (SD->isSchedulingEntity() && SD->isPartOfBundle())
BS->calculateDependencies(SD, false, this);
- }
+ });
}
BS->initialFillReadyList(ReadyInsts);
Instruction *LastScheduledInst = BS->ScheduleEnd;
+ DenseMap<ScheduleData *, ScheduleData *> ReschedMap;
+
+ auto ReorderBundle = [this](ScheduleData *SD) {
+ SmallVector<Instruction *, 2> Insts;
+ TreeEntry *SDTE = SD->TE;
+ if (SDTE && SDTE->isAltOpCopy()) {
+ unsigned Opcode = SD->TE->getOpcode();
+ for (ScheduleData *BundleMember = SD; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ if (BundleMember->Inst->getOpcode() != Opcode) {
+ Insts.push_back(BundleMember->Inst);
+ } else {
+ Insts.insert(Insts.begin(), BundleMember->Inst);
+ }
+ }
+ } else {
+ for (ScheduleData *BundleMember = SD; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ if (CopyableAltOp.contains(BundleMember->Inst))
+ Insts.insert(Insts.begin(), BundleMember->Inst);
+ else
+ Insts.push_back(BundleMember->Inst);
+ }
+ }
+ return Insts;
+ };
// Do the "real" scheduling.
while (!ReadyInsts.empty()) {
ScheduleData *Picked = *ReadyInsts.begin();
ReadyInsts.erase(ReadyInsts.begin());
- // Move the scheduled instruction(s) to their dedicated places, if not
- // there yet.
+ // Reorder copyable elements so they are emitted after the main operations.
for (ScheduleData *BundleMember = Picked; BundleMember;
BundleMember = BundleMember->NextInBundle) {
- Instruction *PickedInst = BundleMember->Inst;
- if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
+ if (CopyableAltOp.contains(BundleMember->Inst)) {
+ ScheduleData *SD = CopyElementsMap[BundleMember->Inst];
+ if (SD && SD->FirstInBundle != Picked)
+ ReschedMap[SD] = Picked;
+ }
+ }
+
+ // Move the scheduled instruction(s) to their dedicated places, if not
+ // there yet.
+ for (Instruction *PickedInst : ReorderBundle(Picked)) {
+ if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst &&
+ LastScheduledInst->getPrevNode())
PickedInst->moveAfter(LastScheduledInst->getPrevNode());
LastScheduledInst = PickedInst;
}
-
+ if (ReschedMap.contains(Picked)) {
+ ScheduleData *Resched = ReschedMap[Picked];
+ for (Instruction *PickedInst : ReorderBundle(Resched)) {
+ if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst &&
+ LastScheduledInst->getPrevNode())
+ PickedInst->moveAfter(LastScheduledInst->getPrevNode());
+ LastScheduledInst = PickedInst;
+ }
+ }
BS->schedule(Picked, ReadyInsts);
}
@@ -17767,9 +18311,10 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
// Check that all schedulable entities got scheduled
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
- ScheduleData *SD = BS->getScheduleData(I);
- if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
- assert(SD->IsScheduled && "must be scheduled at this point");
+ BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
+ if (SD->isSchedulingEntity() && SD->hasValidDependencies())
+ assert(SD->IsScheduled && "must be scheduled at this point");
+ });
}
#endif
@@ -17880,6 +18425,9 @@ bool BoUpSLP::collectValuesToDemote(
if (NodesToKeepBWs.contains(E.Idx))
return false;
+ if (E.isAltOpCopy())
+ return false;
+
// If the value is not a vectorized instruction in the expression and not used
// by the insertelement instruction and not used in multiple vector nodes, it
// cannot be demoted.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 869a9d1aee80e..7fa746dc758a9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -slp-vectorize-copyable=true -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,COPYABLE %s
define void @add0(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @add0(
@@ -60,6 +61,13 @@ define void @add1(ptr noalias %dst, ptr noalias %src) {
; POW2-ONLY-NEXT: store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4
; POW2-ONLY-NEXT: ret void
;
+; COPYABLE-LABEL: @add1(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
+; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: ret void
+;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
%0 = load i32, ptr %src, align 4
@@ -82,21 +90,44 @@ entry:
}
define void @sub0(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @sub0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @sub0(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; NON-POW2-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; NON-POW2-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @sub0(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @sub0(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 -1, i32 0, i32 -2, i32 -3>
+; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -180,23 +211,55 @@ entry:
}
define void @addsub0(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @addsub0(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; NON-POW2-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; NON-POW2-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @addsub0(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @addsub0(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 0>
+; COPYABLE-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; COPYABLE-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; COPYABLE-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -220,23 +283,55 @@ entry:
}
define void @addsub1(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
-; CHECK-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @addsub1(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; NON-POW2-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; NON-POW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; NON-POW2-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
+; NON-POW2-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @addsub1(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
+; POW2-ONLY-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @addsub1(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; COPYABLE-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; COPYABLE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP4]], <i32 0, i32 -3>
+; COPYABLE-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -260,21 +355,44 @@ entry:
}
define void @mul(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @mul(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT: store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
-; CHECK-NEXT: store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @mul(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
+; NON-POW2-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; NON-POW2-NEXT: [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; NON-POW2-NEXT: store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; NON-POW2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; NON-POW2-NEXT: store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @mul(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; POW2-ONLY-NEXT: store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @mul(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[TMP0]], <i32 257, i32 -3, i32 1, i32 -9>
+; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -325,6 +443,13 @@ define void @shl0(ptr noalias %dst, ptr noalias %src) {
; POW2-ONLY-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
; POW2-ONLY-NEXT: ret void
;
+; COPYABLE-LABEL: @shl0(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
+; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: ret void
+;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
%0 = load i32, ptr %src, align 4
@@ -434,6 +559,13 @@ define void @add1f(ptr noalias %dst, ptr noalias %src) {
; POW2-ONLY-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
; POW2-ONLY-NEXT: ret void
;
+; COPYABLE-LABEL: @add1f(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; COPYABLE-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: ret void
+;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
%0 = load float, ptr %src, align 4
@@ -456,21 +588,44 @@ entry:
}
define void @sub0f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @sub0f(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[ADD]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @sub0f(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store float [[ADD]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; NON-POW2-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; NON-POW2-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @sub0f(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store float [[ADD]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @sub0f(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; COPYABLE-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -554,23 +709,55 @@ entry:
}
define void @addsub0f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub0f(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @addsub0f(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store float [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; NON-POW2-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; NON-POW2-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @addsub0f(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store float [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @addsub0f(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00>
+; COPYABLE-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; COPYABLE-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; COPYABLE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -594,23 +781,55 @@ entry:
}
define void @addsub1f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub1f(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
-; CHECK-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @addsub1f(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; NON-POW2-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; NON-POW2-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; NON-POW2-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
+; NON-POW2-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @addsub1f(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
+; POW2-ONLY-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @addsub1f(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; COPYABLE-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; COPYABLE-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP4]], <float 0.000000e+00, float -3.000000e+00>
+; COPYABLE-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -634,21 +853,44 @@ entry:
}
define void @mulf(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @mulf(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @mulf(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; NON-POW2-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; NON-POW2-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; NON-POW2-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; NON-POW2-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; NON-POW2-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @mulf(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; POW2-ONLY-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @mulf(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = fmul fast <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
+; COPYABLE-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -729,6 +971,22 @@ define void @add1fn(ptr noalias %dst, ptr noalias %src) {
; POW2-ONLY-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
; POW2-ONLY-NEXT: ret void
;
+; COPYABLE-LABEL: @add1fn(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; COPYABLE-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; COPYABLE-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; COPYABLE-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; COPYABLE-NEXT: ret void
+;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
%0 = load float, ptr %src, align 4
@@ -849,21 +1107,49 @@ entry:
}
define void @mulfn(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @mulfn(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @mulfn(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; NON-POW2-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; NON-POW2-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; NON-POW2-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; NON-POW2-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; NON-POW2-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @mulfn(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; POW2-ONLY-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @mulfn(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; COPYABLE-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], <float 1.000000e+00, float -9.000000e+00>
+; COPYABLE-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -885,3 +1171,154 @@ entry:
store float %sub9, ptr %incdec.ptr7, align 4
ret void
}
+
+define void @and_lshr(ptr %0, ptr %1, float %2, float %3) {
+; NON-POW2-LABEL: @and_lshr(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; NON-POW2-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; NON-POW2-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP5]], 2
+; NON-POW2-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP5]], 4
+; NON-POW2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; NON-POW2-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP6]], i32 1
+; NON-POW2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP7]], i32 2
+; NON-POW2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+; NON-POW2-NEXT: [[TMP12:%.*]] = and <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; NON-POW2-NEXT: [[TMP13:%.*]] = lshr <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; NON-POW2-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; NON-POW2-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[TMP14]] to <4 x float>
+; NON-POW2-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; NON-POW2-NEXT: [[TMP17:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP18:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> poison, <4 x i32> zeroinitializer
+; NON-POW2-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP16]], [[TMP18]]
+; NON-POW2-NEXT: [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <4 x i32> zeroinitializer
+; NON-POW2-NEXT: [[TMP22:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP21]], <4 x float> [[TMP15]], <4 x float> [[TMP19]])
+; NON-POW2-NEXT: store <4 x float> [[TMP22]], ptr [[TMP0]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @and_lshr(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP5]], 2
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP5]], 4
+; POW2-ONLY-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP6]], i32 1
+; POW2-ONLY-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP7]], i32 2
+; POW2-ONLY-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+; POW2-ONLY-NEXT: [[TMP12:%.*]] = and <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; POW2-ONLY-NEXT: [[TMP13:%.*]] = lshr <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; POW2-ONLY-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; POW2-ONLY-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[TMP14]] to <4 x float>
+; POW2-ONLY-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; POW2-ONLY-NEXT: [[TMP17:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; POW2-ONLY-NEXT: [[TMP18:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> poison, <4 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP16]], [[TMP18]]
+; POW2-ONLY-NEXT: [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; POW2-ONLY-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <4 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP22:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP21]], <4 x float> [[TMP15]], <4 x float> [[TMP19]])
+; POW2-ONLY-NEXT: store <4 x float> [[TMP22]], ptr [[TMP0]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @and_lshr(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; COPYABLE-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; COPYABLE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; COPYABLE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 0, i32 2, i32 4, i32 6>
+; COPYABLE-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], <i32 3, i32 3, i32 3, i32 -1>
+; COPYABLE-NEXT: [[TMP10:%.*]] = sitofp <4 x i32> [[TMP9]] to <4 x float>
+; COPYABLE-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; COPYABLE-NEXT: [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; COPYABLE-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT: [[TMP14:%.*]] = fadd <4 x float> [[TMP11]], [[TMP13]]
+; COPYABLE-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; COPYABLE-NEXT: [[TMP16:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT: [[TMP17:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP16]], <4 x float> [[TMP10]], <4 x float> [[TMP14]])
+; COPYABLE-NEXT: store <4 x float> [[TMP17]], ptr [[TMP0]], align 4
+; COPYABLE-NEXT: ret void
+;
+entry:
+ %5 = getelementptr inbounds float, ptr %0, i64 1
+ %6 = getelementptr inbounds float, ptr %0, i64 2
+ %7 = getelementptr inbounds float, ptr %0, i64 3
+ %8 = load i8, ptr %1, align 1
+ %9 = zext i8 %8 to i32
+ %10 = and i32 %9, 3
+ %11 = sitofp i32 %10 to float
+ %12 = lshr i32 %9, 2
+ %13 = and i32 %12, 3
+ %14 = sitofp i32 %13 to float
+ %15 = lshr i32 %9, 4
+ %16 = and i32 %15, 3
+ %17 = sitofp i32 %16 to float
+ %18 = lshr i32 %9, 6
+ %19 = sitofp i32 %18 to float
+ %20 = load float, ptr %0, align 4
+ %21 = fadd float %20, %3
+ %22 = tail call noundef float @llvm.fma.f32(float %2, float %11, float %21)
+ store float %22, ptr %0, align 4
+ %23 = load float, ptr %5, align 4
+ %24 = fadd float %23, %3
+ %25 = tail call noundef float @llvm.fma.f32(float %2, float %14, float %24)
+ store float %25, ptr %5, align 4
+ %26 = load float, ptr %6, align 4
+ %27 = fadd float %26, %3
+ %28 = tail call noundef float @llvm.fma.f32(float %2, float %17, float %27)
+ store float %28, ptr %6, align 4
+ %29 = load float, ptr %7, align 4
+ %30 = fadd float %29, %3
+ %31 = tail call noundef float @llvm.fma.f32(float %2, float %19, float %30)
+ store float %31, ptr %7, align 4
+ ret void
+}
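+
A note on how the COPYABLE output above encodes copies (a reading aid distilled
from the checks, not new functionality in the patch): a lane whose scalar is a
plain copy keeps the bundle's opcode but receives that opcode's identity
constant, so the copy folds into the vector operation without an extra shuffle.
@and_lshr shows this in both directions: lane 0 has no lshr in the scalar code,
so it is modeled as a shift by 0, and lane 3 has no and, so it is modeled as a
mask with -1:

  %8 = lshr <4 x i32> %7, <i32 0, i32 2, i32 4, i32 6>       ; lane 0: x lshr 0 == x
  %9 = and <4 x i32> %8, <i32 3, i32 3, i32 3, i32 -1>       ; lane 3: x and -1 == x

The same encoding appears throughout this file as add 0, sub 0, shl 0, mul 1,
fadd fast -0.0, fsub fast 0.0, and fmul fast 1.0.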
+
+define void @add_shl(ptr %sinfo) {
+; NON-POW2-LABEL: @add_shl(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[SHL_I:%.*]] = shl i32 0, 0
+; NON-POW2-NEXT: [[END_CODE_I:%.*]] = getelementptr i8, ptr [[SINFO:%.*]], i64 348
+; NON-POW2-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[SHL_I]], i32 3
+; NON-POW2-NEXT: [[TMP1:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]]
+; NON-POW2-NEXT: [[TMP2:%.*]] = shl <4 x i32> zeroinitializer, [[TMP0]]
+; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; NON-POW2-NEXT: store <4 x i32> [[TMP3]], ptr [[END_CODE_I]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @add_shl(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[SHL_I:%.*]] = shl i32 0, 0
+; POW2-ONLY-NEXT: [[END_CODE_I:%.*]] = getelementptr i8, ptr [[SINFO:%.*]], i64 348
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[SHL_I]], i32 3
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]]
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = shl <4 x i32> zeroinitializer, [[TMP0]]
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; POW2-ONLY-NEXT: store <4 x i32> [[TMP3]], ptr [[END_CODE_I]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @add_shl(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[END_CODE_I:%.*]] = getelementptr i8, ptr [[SINFO:%.*]], i64 348
+; COPYABLE-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> <i32 0, i32 0, i32 poison, i32 poison>, <2 x i32> zeroinitializer, i64 2)
+; COPYABLE-NEXT: [[TMP1:%.*]] = add <4 x i32> [[TMP0]], zeroinitializer
+; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[END_CODE_I]], align 4
+; COPYABLE-NEXT: ret void
+;
+entry:
+ %shl.i = shl i32 0, 0
+ %add.i = add i32 0, 0
+ %end_code.i = getelementptr i8, ptr %sinfo, i64 348
+ store i32 %add.i, ptr %end_code.i, align 4
+ %add.i.i = add i32 0, 0
+ %code_size.i.i = getelementptr i8, ptr %sinfo, i64 352
+ store i32 %add.i.i, ptr %code_size.i.i, align 8
+ %shl.i.i = shl i32 0, 0
+ %limit_code.i.i = getelementptr i8, ptr %sinfo, i64 356
+ store i32 %shl.i.i, ptr %limit_code.i.i, align 4
+ %add2.i.i = add i32 %shl.i, 0
+ %max_code.i.i = getelementptr i8, ptr %sinfo, i64 360
+ store i32 %add2.i.i, ptr %max_code.i.i, align 8
+ ret void
+}
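
To see the end-to-end effect in isolation, here is a minimal reproducer (an
illustrative sketch mirroring the @add1 pattern above; the function name is
made up and the expected output is inferred from the COPYABLE checks, assuming
the -slp-vectorize-copyable flag introduced by this patch):

; RUN: opt -passes=slp-vectorizer -slp-vectorize-copyable=true -S \
; RUN:     -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 %s
define void @copy_lane_demo(ptr noalias %dst, ptr noalias %src) {
entry:
  ; lane 0 is a plain copy; lanes 1-3 add 1, 2 and 3 respectively
  %s0 = load i32, ptr %src, align 4
  store i32 %s0, ptr %dst, align 4
  %sp1 = getelementptr inbounds i32, ptr %src, i64 1
  %s1 = load i32, ptr %sp1, align 4
  %a1 = add nsw i32 %s1, 1
  %dp1 = getelementptr inbounds i32, ptr %dst, i64 1
  store i32 %a1, ptr %dp1, align 4
  %sp2 = getelementptr inbounds i32, ptr %src, i64 2
  %s2 = load i32, ptr %sp2, align 4
  %a2 = add nsw i32 %s2, 2
  %dp2 = getelementptr inbounds i32, ptr %dst, i64 2
  store i32 %a2, ptr %dp2, align 4
  %sp3 = getelementptr inbounds i32, ptr %src, i64 3
  %s3 = load i32, ptr %sp3, align 4
  %a3 = add nsw i32 %s3, 3
  %dp3 = getelementptr inbounds i32, ptr %dst, i64 3
  store i32 %a3, ptr %dp3, align 4
  ret void
}
; With copyable vectorization the whole body collapses to a single vector add
; with 0 in the copied lane (per the @add1 COPYABLE checks):
;   %0 = load <4 x i32>, ptr %src, align 4
;   %1 = add nsw <4 x i32> %0, <i32 0, i32 1, i32 2, i32 3>
;   store <4 x i32> %1, ptr %dst, align 4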
>From d0dc242f1b18aba43fa57062b7f4b7e5ffa593e5 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at gmail.com>
Date: Wed, 26 Feb 2025 13:11:56 +0000
Subject: [PATCH 2/3] Add method to schedule copy instructions in
BlockScheduling.schedule(); remove ReschedMap from BoUpSLP::scheduleBlock().
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 80 +++++++++++--------
1 file changed, 48 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5225eb2b2eefa..6b6c722ad7259 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3835,16 +3835,16 @@ class BoUpSLP {
continue;
auto It = ScalarToTreeEntries.find(V);
Instruction *I = dyn_cast<Instruction>(V);
- bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
- if (S.isAltOpCopy() && IsAltInst) {
- CopyableAltOp.insert(V);
- continue;
- }
assert(
(It == ScalarToTreeEntries.end() ||
(It->getSecond().size() == 1 && It->getSecond().front() == Last) ||
doesNotNeedToBeScheduled(V)) &&
"Scalar already in tree!");
+ bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
+ if (S.isAltOpCopy() && IsAltInst) {
+ CopyableAltOp[V] = Last;
+ continue;
+ }
if (It == ScalarToTreeEntries.end()) {
ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
(void)Processed.insert(V);
@@ -3954,8 +3954,8 @@ class BoUpSLP {
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
- /// A set op scalars that we are considoring as copyable operations.
- ValueSet CopyableAltOp;
+ /// Maps a scalar copy to its tree entry.
+ SmallDenseMap<Value *, TreeEntry *> CopyableAltOp;
/// A set of first non-schedulable values.
ValueSet NonScheduledFirst;
@@ -4264,6 +4264,9 @@ class BoUpSLP {
/// True if this instruction is a copy.
bool IsCopy = false;
+
+ /// Points to where the copyable instruction was introduced.
+ ScheduleData *CopyInst = nullptr;
};
#ifndef NDEBUG
@@ -4413,6 +4416,23 @@ class BoUpSLP {
for (Use &U : BundleMember->Inst->operands())
if (auto *I = dyn_cast<Instruction>(U.get()))
DecrUnsched(I);
+ // Handle copy instruction dependencies.
+ if (TE && TE->isAltOpCopy() && BundleMember->IsCopy) {
+ doForAllOpcodes(BundleMember->Inst, [BundleMember, &ReadyList](
+ ScheduleData *CopyUse) {
+ if (BundleMember != CopyUse && CopyUse->hasValidDependencies() &&
+ CopyUse->incrementUnscheduledDeps(-1) == 0) {
+ ScheduleData *DepBundle = CopyUse->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ if (DepBundle->isReady()) {
+ ReadyList.insert(DepBundle);
+ LLVM_DEBUG(dbgs() << "SLP: gets ready (copyable): "
+ << *DepBundle << "\n");
+ }
+ }
+ });
+ }
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
@@ -4498,8 +4518,8 @@ class BoUpSLP {
/// Build a bundle from the ScheduleData nodes corresponding to the
/// scalar instruction for each lane.
- ScheduleData *buildBundle(ArrayRef<Value *> VL, const InstructionsState &S,
- bool &ReSchedule);
+ ScheduleData *buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S, bool &ReSchedule);
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
@@ -17606,8 +17626,10 @@ void BoUpSLP::optimizeGatherSequence() {
GatherShuffleExtractSeq.clear();
}
-BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::buildBundle(
- ArrayRef<Value *> VL, const InstructionsState &S, bool &ReSchedule) {
+BoUpSLP::ScheduleData *
+BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S,
+ bool &ReSchedule) {
ScheduleData *Bundle = nullptr;
ScheduleData *PrevInBundle = nullptr;
unsigned Opcode = S.getOpcode();
@@ -17675,6 +17697,13 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::buildBundle(
if (S.isAltOpCopy() && IsAltInst)
BundleMember->IsCopy = true;
PrevInBundle = BundleMember;
+ if (SLP->CopyableAltOp.contains(I)) {
+ TreeEntry *TE = SLP->CopyableAltOp[I];
+ assert(TE && "Incorrect state");
+ ScheduleData *SD = getScheduleData(I, TE);
+ assert(SD && SD->IsCopy && "ScheduleData incorrect state");
+ BundleMember->CopyInst = SD;
+ }
}
assert(Bundle && "Failed to find schedule bundle");
return Bundle;
@@ -17772,7 +17801,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ReSchedule = true;
}
- auto *Bundle = buildBundle(VL, S, ReSchedule);
+ auto *Bundle = buildBundle(VL, SLP, S, ReSchedule);
if (!Bundle)
return std::nullopt;
TryScheduleBundleImpl(ReSchedule, Bundle);
@@ -17820,6 +17849,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
BundleMember->NextInBundle = nullptr;
BundleMember->TE = nullptr;
BundleMember->IsCopy = false;
+ BundleMember->CopyInst = nullptr;
if (BundleMember->unscheduledDepsInBundle() == 0) {
ReadyInsts.insert(BundleMember);
}
@@ -18010,6 +18040,12 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
BundleMember->Dependencies = 0;
BundleMember->resetUnscheduledDeps();
+ // Handle copy instruction dependencies.
+ if (BundleMember->CopyInst) {
+ BundleMember->Dependencies++;
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+
// Handle def-use chain dependencies.
for (User *U : BundleMember->Inst->users()) {
if (auto *I = dyn_cast<Instruction>(U)) {
@@ -18240,7 +18276,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
BS->initialFillReadyList(ReadyInsts);
Instruction *LastScheduledInst = BS->ScheduleEnd;
- DenseMap<ScheduleData *, ScheduleData *> ReschedMap;
auto ReorderBundle = [this](ScheduleData *SD) {
SmallVector<Instruction *, 2> Insts;
@@ -18273,16 +18308,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
ScheduleData *Picked = *ReadyInsts.begin();
ReadyInsts.erase(ReadyInsts.begin());
- // Reorder copyable elements to emit after main operations.
- for (ScheduleData *BundleMember = Picked; BundleMember;
- BundleMember = BundleMember->NextInBundle) {
- if (CopyableAltOp.contains(BundleMember->Inst)) {
- ScheduleData *SD = CopyElementsMap[BundleMember->Inst];
- if (SD && SD->FirstInBundle != Picked)
- ReschedMap[SD] = Picked;
- }
- }
-
// Move the scheduled instruction(s) to their dedicated places, if not
// there yet.
for (Instruction *PickedInst : ReorderBundle(Picked)) {
@@ -18291,15 +18316,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
PickedInst->moveAfter(LastScheduledInst->getPrevNode());
LastScheduledInst = PickedInst;
}
- if (ReschedMap.contains(Picked)) {
- ScheduleData *Resched = ReschedMap[Picked];
- for (Instruction *PickedInst : ReorderBundle(Resched)) {
- if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst &&
- LastScheduledInst->getPrevNode())
- PickedInst->moveAfter(LastScheduledInst->getPrevNode());
- LastScheduledInst = PickedInst;
- }
- }
BS->schedule(Picked, ReadyInsts);
}
>From e9bd6d47df66d9cf45951d04b3b147fde10c5534 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at gmail.com>
Date: Mon, 3 Mar 2025 22:02:00 +0000
Subject: [PATCH 3/3] Resolved review comments by removing the IsAltOpCopy flag
from InstructionsState. Restored the original logic to handle only homogeneous
operations in getSameOpcode(). Removed checkCopyableInnerDep(), replacing its
functionality with the scheduler.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 356 ++++++-----
.../X86/vect_copyable_in_binops.ll | 579 ++++++------------
2 files changed, 352 insertions(+), 583 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6b6c722ad7259..24f47f5abd692 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -854,9 +854,6 @@ class InstructionsState {
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
- /// True if alterative operation is copy instruction.
- bool IsAltOpCopy = false;
-
public:
Instruction *getMainOp() const {
assert(valid() && "InstructionsState is invalid.");
@@ -873,13 +870,9 @@ class InstructionsState {
unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
- bool isAltOpCopy() const { return IsAltOpCopy; }
-
/// Some of the instructions in the list have alternate opcodes.
bool isAltShuffle() const { return getMainOp() != getAltOp(); }
- void setAltOpCopy(bool Val) { IsAltOpCopy = Val; }
-
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
@@ -920,18 +913,17 @@ static bool isValidForAlternation(unsigned Opcode) {
return true;
}
-// Check for inner dependencies, we could not support such depenedies if it
-// comes from a main operaion, only from alternative or for now we ignore
-// alternative operations depenedies to any alternative.
-static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
- const InstructionsState &S);
-
-// Determine that the vector could be vectorized with copyable elements.
static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt);
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
const TargetLibraryInfo &TLI);
+static InstructionsState getCopyableOpcode(ArrayRef<Value *> VL,
+ const TargetLibraryInfo &TLI);
+
+static InstructionsState getCombinedOpcode(ArrayRef<Value *> VL,
+ const TargetLibraryInfo &TLI);
+
/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
@@ -981,51 +973,19 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
return InstructionsState::invalid();
Instruction *MainOp = cast<Instruction>(*It);
- Instruction *AltOp = MainOp;
- unsigned Opcode = MainOp->getOpcode();
- unsigned AltOpcode = Opcode;
- for (Value *V : iterator_range(It + 1, VL.end())) {
- Instruction *Inst = dyn_cast<Instruction>(V);
- if (!Inst)
- continue;
- unsigned VOpcode = Inst->getOpcode();
- if (Inst && AltOpcode == Opcode && !isa<PHINode>(Inst) &&
- VOpcode != Opcode && isValidForAlternation(VOpcode)) {
- AltOpcode = VOpcode;
- AltOp = Inst;
- break;
- }
- }
unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
(VL.size() == 2 && InstCnt < 2))
return InstructionsState::invalid();
- bool IsBinOp = isa<BinaryOperator>(MainOp);
- bool IsCopyable = false;
- if (MainOp && AltOp && MainOp != AltOp) {
- if (!IsBinOp && isa<BinaryOperator>(AltOp) && !isa<PHINode>(MainOp)) {
- std::swap(MainOp, AltOp);
- std::swap(AltOpcode, Opcode);
- IsBinOp = true;
- }
- IsCopyable = VectorizeCopyable && isCopyableOp(VL, MainOp, AltOp);
- if (IsCopyable && isa<CmpInst>(AltOp)) {
- Type *Ty0 = MainOp->getOperand(0)->getType();
- Type *Ty1 = AltOp->getOperand(0)->getType();
- if (Ty0 != Ty1)
- return InstructionsState::invalid();
- } else if (!IsCopyable) {
- MainOp = cast<Instruction>(*It);
- AltOp = MainOp;
- Opcode = MainOp->getOpcode();
- AltOpcode = Opcode;
- IsBinOp = isa<BinaryOperator>(MainOp);
- }
- }
+
bool IsCastOp = isa<CastInst>(MainOp);
+ bool IsBinOp = isa<BinaryOperator>(MainOp);
bool IsCmpOp = isa<CmpInst>(MainOp);
CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
: CmpInst::BAD_ICMP_PREDICATE;
+ Instruction *AltOp = MainOp;
+ unsigned Opcode = MainOp->getOpcode();
+ unsigned AltOpcode = Opcode;
bool SwappedPredsCompatible = IsCmpOp && [&]() {
SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
@@ -1081,12 +1041,12 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
AltOp = I;
continue;
}
- } else if ((IsCastOp || IsCopyable) && isa<CastInst>(I)) {
+ } else if (IsCastOp && isa<CastInst>(I)) {
Value *Op0 = MainOp->getOperand(0);
Type *Ty0 = Op0->getType();
Value *Op1 = I->getOperand(0);
Type *Ty1 = Op1->getType();
- if (Ty0 == Ty1 || IsCopyable) {
+ if (Ty0 == Ty1) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode) {
@@ -1098,15 +1058,13 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
continue;
}
}
- } else if (auto *Inst = dyn_cast<CmpInst>(I);
- Inst && (IsCmpOp || IsCopyable)) {
+ } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
auto *BaseInst = cast<CmpInst>(MainOp);
Type *Ty0 = BaseInst->getOperand(0)->getType();
Type *Ty1 = Inst->getOperand(0)->getType();
if (Ty0 == Ty1) {
- assert((IsCopyable || InstOpcode == Opcode) &&
- "Expected same CmpInst opcode.");
- assert((IsCopyable || InstOpcode == AltOpcode) &&
+ assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
+ assert(InstOpcode == AltOpcode &&
"Alternate instructions are only supported by BinaryOperator "
"and CastInst.");
// Check for compatible operands. If the corresponding operands are not
@@ -1137,32 +1095,23 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
AltPred == CurrentPred || AltPred == SwappedCurrentPred)
continue;
}
- } else if (InstOpcode == Opcode ||
- (IsCopyable && InstOpcode == AltOpcode)) {
- assert((IsCopyable || InstOpcode == AltOpcode) &&
+ } else if (InstOpcode == Opcode) {
+ assert(InstOpcode == AltOpcode &&
"Alternate instructions are only supported by BinaryOperator and "
"CastInst.");
- Instruction *Op = MainOp;
- if (IsCopyable) {
- if (InstOpcode != Opcode && InstOpcode != AltOpcode) {
- Op = I;
- } else if (Opcode != AltOpcode && InstOpcode == AltOpcode) {
- Op = AltOp;
- }
- }
if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
if (Gep->getNumOperands() != 2 ||
- Gep->getOperand(0)->getType() != Op->getOperand(0)->getType())
+ Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
return InstructionsState::invalid();
} else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
if (!isVectorLikeInstWithConstOps(EI))
return InstructionsState::invalid();
} else if (auto *LI = dyn_cast<LoadInst>(I)) {
- auto *BaseLI = cast<LoadInst>(Op);
+ auto *BaseLI = cast<LoadInst>(MainOp);
if (!LI->isSimple() || !BaseLI->isSimple())
return InstructionsState::invalid();
} else if (auto *Call = dyn_cast<CallInst>(I)) {
- auto *CallBase = cast<CallInst>(Op);
+ auto *CallBase = cast<CallInst>(MainOp);
if (Call->getCalledFunction() != CallBase->getCalledFunction())
return InstructionsState::invalid();
if (Call->hasOperandBundles() &&
@@ -1177,14 +1126,13 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
return InstructionsState::invalid();
if (!ID) {
SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
- if (Mappings.size() &&
- (Mappings.size() != BaseMappings.size() ||
- Mappings.front().ISA != BaseMappings.front().ISA ||
- Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
- Mappings.front().VectorName != BaseMappings.front().VectorName ||
- Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
- Mappings.front().Shape.Parameters !=
- BaseMappings.front().Shape.Parameters))
+ if (Mappings.size() != BaseMappings.size() ||
+ Mappings.front().ISA != BaseMappings.front().ISA ||
+ Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
+ Mappings.front().VectorName != BaseMappings.front().VectorName ||
+ Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
+ Mappings.front().Shape.Parameters !=
+ BaseMappings.front().Shape.Parameters)
return InstructionsState::invalid();
}
}
@@ -1196,6 +1144,69 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
return InstructionsState(MainOp, AltOp);
}
+/// \returns analysis of the Instructions in \p VL described in
+/// InstructionsState, proposing vectorization with copyable instructions.
+static InstructionsState getCopyableOpcode(ArrayRef<Value *> VL,
+ const TargetLibraryInfo &TLI) {
+ if (VL.empty() || !all_of(VL, IsaPred<Instruction>))
+ return InstructionsState::invalid();
+ Instruction *MainOp = cast<Instruction>(VL[0]);
+ Instruction *AltOp = nullptr;
+ unsigned Opcode = MainOp->getOpcode();
+ unsigned AltOpcode = Opcode;
+ if (VectorizeCopyable) {
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ if (I->isIntDivRem() || I->isFPDivRem())
+ return InstructionsState::invalid();
+ if (isa<PHINode>(I)) {
+ AltOp = nullptr;
+ break;
+ }
+ unsigned VOpcode = I->getOpcode();
+ if (VOpcode != Opcode) {
+ if (AltOpcode == Opcode) {
+ AltOpcode = VOpcode;
+ AltOp = I;
+ }
+ if (VOpcode != AltOpcode) {
+ AltOp = nullptr;
+ break;
+ }
+ }
+ }
+ if (AltOp) {
+ bool IsBinOp = isa<BinaryOperator>(MainOp);
+ bool IsAltBinOp = isa<BinaryOperator>(AltOp);
+ if (!IsBinOp && IsAltBinOp) {
+ std::swap(MainOp, AltOp);
+ std::swap(IsBinOp, IsAltBinOp);
+ std::swap(Opcode, AltOpcode);
+ }
+ if ((IsBinOp || IsAltBinOp) && !(IsBinOp && IsAltBinOp) &&
+ isCopyableOp(VL, MainOp, AltOp)) {
+ SmallVector<Value *, 8> MainOps, AltOps;
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ if (I->getOpcode() == Opcode)
+ MainOps.push_back(I);
+ else
+ AltOps.push_back(I);
+ }
+ if (getSameOpcode(MainOps, TLI) && getSameOpcode(AltOps, TLI))
+ return InstructionsState(MainOp, AltOp);
+ }
+ }
+ }
+ return InstructionsState::invalid();
+}
+
+static InstructionsState getCombinedOpcode(ArrayRef<Value *> VL,
+ const TargetLibraryInfo &TLI) {
+ InstructionsState S = getSameOpcode(VL, TLI);
+ return S ? S : getCopyableOpcode(VL, TLI);
+}
+
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
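To make the copyable state concrete: a bundle like the one sketched below fails strict same-opcode matching, but getCopyableOpcode can, in the intended design, still classify it, keeping the binary op as MainOp and treating the odd lane out as a copy. The helper is a rough standalone sketch (name hypothetical, not part of this patch) of the MainOps/AltOps partitioning done above:

  // Bundle sketch (IR in comments): getSameOpcode rejects the mix, while
  // getCopyableOpcode keeps "add" as the main opcode and marks the load
  // lane as a copy, to be modelled later as "add %b, 0".
  //   %a = add i32 %x, 1
  //   %b = load i32, ptr %p
  //   %c = add i32 %y, 2
  static void splitCopyableLanes(ArrayRef<Value *> VL, unsigned MainOpcode,
                                 SmallVectorImpl<Value *> &MainOps,
                                 SmallVectorImpl<Value *> &CopyOps) {
    for (Value *V : VL) {
      // All lanes are known to be instructions at this point.
      auto *I = cast<Instruction>(V);
      (I->getOpcode() == MainOpcode ? MainOps : CopyOps).push_back(I);
    }
  }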
@@ -1233,37 +1244,7 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
}
}
-static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
- const InstructionsState &S) {
- SmallSet<Value *, 4> Ops;
- SmallSet<Value *, 4> AltOps;
- unsigned Opcode = S.getOpcode();
- for (Value *V : VL) {
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- continue;
- if (I->getOpcode() == Opcode)
- Ops.insert(V);
- else
- AltOps.insert(V);
- }
- for (Value *V : VL) {
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- continue;
- for (Use &U : I->operands())
- if (auto *Op = dyn_cast<Instruction>(U.get()); Op && Ops.contains(Op))
- return false;
- if (I->getOpcode() != Opcode) {
- for (Use &U : I->operands())
- if (auto *Op = dyn_cast<Instruction>(U.get());
- Op && AltOps.contains(Op))
- return false;
- }
- }
- return true;
-}
-
+// Determine whether the vector could be vectorized with copyable elements.
static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
if (any_of(VL, IsaPred<PoisonValue>) || Main == Alt ||
!isa<BinaryOperator>(Main) || !isa<Instruction>(Alt) ||
@@ -1281,6 +1262,7 @@ static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
return true;
return false;
}
+
/// \returns the AA location that is being access by the instruction.
static MemoryLocation getLocation(Instruction *I) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
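Worth spelling out why isCopyableOp insists that Main is a binary operation: a copy lane is only sound if the main opcode has an identity operand, so the copied value can be recreated as "main op value, identity". A minimal sketch of that identity for the integer opcodes (illustration only, not the patch's code; for fast-math FP the analogous identities would be -0.0 for fadd and 1.0 for fmul):

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Identity operand that turns a plain value "x" into an equivalent
  // binary operation, e.g. "add x, 0", "mul x, 1" or "and x, -1".
  static Constant *identityFor(unsigned Opcode, Type *Ty) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
      return ConstantInt::get(Ty, 0);
    case Instruction::Mul:
      return ConstantInt::get(Ty, 1);
    case Instruction::And:
      return Constant::getAllOnesValue(Ty);
    default:
      return nullptr; // No safe identity; the lane cannot be a copy.
    }
  }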
@@ -3526,6 +3508,9 @@ class BoUpSLP {
/// Interleaving factor for interleaved loads Vectorize nodes.
unsigned InterleaveFactor = 0;
+ /// True if the alternative operation is a copy instruction.
+ bool IsAltOpCopy = false;
+
public:
/// Returns interleave factor for interleave nodes.
unsigned getInterleaveFactor() const { return InterleaveFactor; }
@@ -3581,7 +3566,7 @@ class BoUpSLP {
}
/// Some of the instructions in the list have alternate opcodes.
- bool isAltShuffle() const { return S.isAltShuffle() && !S.isAltOpCopy(); }
+ bool isAltShuffle() const { return S.isAltShuffle() && !IsAltOpCopy; }
bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
@@ -3609,7 +3594,9 @@ class BoUpSLP {
unsigned getAltOpcode() const { return S.getAltOpcode(); }
- bool isAltOpCopy() const { return S.isAltOpCopy(); }
+ bool isAltOpCopy() const { return IsAltOpCopy; }
+
+ void setAltOpCopy(bool Val) { IsAltOpCopy = Val; }
bool hasState() const { return S.valid(); }
@@ -3710,7 +3697,7 @@ class BoUpSLP {
if (S) {
dbgs() << "MainOp: " << *S.getMainOp() << "\n";
dbgs() << "AltOp: " << *S.getAltOp() << "\n";
- dbgs() << "isAltOpCopy: " << S.isAltOpCopy() << "\n";
+ dbgs() << "IsAltOpCopy: " << IsAltOpCopy << "\n";
} else {
dbgs() << "MainOp: NULL\n";
dbgs() << "AltOp: NULL\n";
@@ -3764,29 +3751,27 @@ class BoUpSLP {
#endif
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
- std::optional<ScheduleData *> Bundle,
- const InstructionsState &S,
- const EdgeInfo &UserTreeIdx,
- ArrayRef<int> ReuseShuffleIndices = {},
- ArrayRef<unsigned> ReorderIndices = {},
- unsigned InterleaveFactor = 0) {
+ TreeEntry *
+ newTreeEntry(ArrayRef<Value *> VL, std::optional<ScheduleData *> Bundle,
+ const InstructionsState &S, const EdgeInfo &UserTreeIdx,
+ ArrayRef<int> ReuseShuffleIndices = {},
+ ArrayRef<unsigned> ReorderIndices = {},
+ unsigned InterleaveFactor = 0, bool IsAltOpCopy = false) {
TreeEntry::EntryState EntryState =
Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
- TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
- ReuseShuffleIndices, ReorderIndices);
+ TreeEntry *E =
+ newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
+ ReuseShuffleIndices, ReorderIndices, IsAltOpCopy);
if (E && InterleaveFactor > 0)
E->setInterleave(InterleaveFactor);
return E;
}
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
- TreeEntry::EntryState EntryState,
- std::optional<ScheduleData *> Bundle,
- const InstructionsState &S,
- const EdgeInfo &UserTreeIdx,
- ArrayRef<int> ReuseShuffleIndices = {},
- ArrayRef<unsigned> ReorderIndices = {}) {
+ TreeEntry *newTreeEntry(
+ ArrayRef<Value *> VL, TreeEntry::EntryState EntryState,
+ std::optional<ScheduleData *> Bundle, const InstructionsState &S,
+ const EdgeInfo &UserTreeIdx, ArrayRef<int> ReuseShuffleIndices = {},
+ ArrayRef<unsigned> ReorderIndices = {}, bool IsAltOpCopy = false) {
assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
(Bundle && EntryState != TreeEntry::NeedToGather)) &&
"Need to vectorize gather entry?");
@@ -3804,7 +3789,7 @@ class BoUpSLP {
// for non-power-of-two vectors.
assert(
(hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
- S.isAltOpCopy() || ReuseShuffleIndices.empty()) &&
+ ReuseShuffleIndices.empty()) &&
"Reshuffling scalars not yet supported for nodes with padding");
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
@@ -3829,6 +3814,8 @@ class BoUpSLP {
if (!Last->isGather()) {
SmallPtrSet<Value *, 4> Processed;
unsigned Opcode = S.getOpcode();
+ if (IsAltOpCopy)
+ Last->setAltOpCopy(true);
for (unsigned i = 0; i < VL.size(); ++i) {
Value *V = VL[i];
if (isa<PoisonValue>(V))
@@ -3841,7 +3828,7 @@ class BoUpSLP {
doesNotNeedToBeScheduled(V)) &&
"Scalar already in tree!");
bool IsAltInst = (I) ? I->getOpcode() != Opcode : false;
- if (S.isAltOpCopy() && IsAltInst) {
+ if (IsAltOpCopy && IsAltInst) {
CopyableAltOp[V] = Last;
continue;
}
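The CopyableAltOp container used here keeps copy lanes out of the regular scalar-to-tree-entry mapping, so later lookups can tell a genuine member of a vectorized node from a lane that is merely copied into it. A condensed sketch of that bookkeeping (the map type is assumed for illustration; the real member is declared elsewhere in BoUpSLP):

  // Copy lanes are remembered on the side rather than in ScalarToTreeEntry.
  DenseMap<Value *, TreeEntry *> CopyableAltOp;
  bool isCopyLane(Value *V) const { return CopyableAltOp.contains(V); }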
@@ -4418,17 +4405,16 @@ class BoUpSLP {
DecrUnsched(I);
// Handle a copy instruction dependencies.
if (TE && TE->isAltOpCopy() && BundleMember->IsCopy) {
- doForAllOpcodes(BundleMember->Inst, [BundleMember, &ReadyList](
- ScheduleData *CopyUse) {
- if (BundleMember != CopyUse && CopyUse->hasValidDependencies() &&
+ doForAllOpcodes(BundleMember->Inst, [BundleMember, &ReadyList](ScheduleData *CopyUse) {
+ if (BundleMember != CopyUse &&
+ CopyUse->hasValidDependencies() &&
CopyUse->incrementUnscheduledDeps(-1) == 0) {
ScheduleData *DepBundle = CopyUse->FirstInBundle;
assert(!DepBundle->IsScheduled &&
- "already scheduled bundle gets ready");
+ "already scheduled bundle gets ready");
if (DepBundle->isReady()) {
ReadyList.insert(DepBundle);
- LLVM_DEBUG(dbgs() << "SLP: gets ready (copyable): "
- << *DepBundle << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: gets ready (copyable): " << *DepBundle << "\n");
}
}
});
@@ -4519,7 +4505,8 @@ class BoUpSLP {
/// Build a bundle from the ScheduleData nodes corresponding to the
/// scalar instruction for each lane.
ScheduleData *buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S, bool &ReSchedule);
+ const InstructionsState &S, bool &ReSchedule,
+ bool IsAltOpCopy);
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
@@ -4529,7 +4516,8 @@ class BoUpSLP {
std::optional<ScheduleData *> tryScheduleBundle(ArrayRef<Value *> VL,
BoUpSLP *SLP,
const InstructionsState &S,
- bool AnyCopies);
+ bool AnyCopies,
+ bool IsAltOpCopy);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, ScheduleData *Bundle);
@@ -7937,8 +7925,6 @@ bool BoUpSLP::canRepresentAsCopyable(const InstructionsState &S,
SmallVector<unsigned> MainAltOps;
unsigned Operand;
- if (!checkCopyableInnerDep(VL, S))
- return false;
if (isCopyableOp(VL, S.getMainOp(), S.getAltOp()))
return true;
if ((isa<BinaryOperator>(S.getMainOp()) && Opcode0 == Opcode1) ||
@@ -8507,11 +8493,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::Vectorize;
}
case Instruction::ShuffleVector: {
- if (VectorizeCopyable && isCopyableOp(VL, S.getMainOp(), S.getAltOp()) &&
- checkCopyableInnerDep(VL, S)) {
- S.setAltOpCopy(true);
- return TreeEntry::Vectorize;
- }
if (!S.isAltShuffle()) {
// REVEC can support non alternate shuffle.
if (SLPReVec && getShufflevectorNumGroups(VL))
@@ -8528,9 +8509,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
"the whole alt sequence is not profitable.\n");
return TreeEntry::NeedToGather;
}
- if (VectorizeCopyable && canRepresentAsCopyable(S, VL))
- S.setAltOpCopy(true);
-
return TreeEntry::Vectorize;
}
default:
@@ -8708,6 +8686,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
};
InstructionsState S = getSameOpcode(VL, *TLI);
+ bool IsAltOpCopy = false;
+ if (!S && VectorizeCopyable) {
+ S = getCopyableOpcode(VL, *TLI);
+ if (S) {
+ if (isCopyableOp(VL, S.getMainOp(), S.getAltOp()) ||
+ canRepresentAsCopyable(S, VL)) {
+ IsAltOpCopy = true;
+ } else {
+ S = InstructionsState::invalid();
+ }
+ }
+ } else if (VectorizeCopyable && canRepresentAsCopyable(S, VL))
+ IsAltOpCopy = true;
// Don't go into catchswitch blocks, which can happen with PHIs.
// Such blocks can only have PHIs and the catchswitch. There is no
@@ -8954,7 +8945,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<Value *> PointerOps;
TreeEntry::EntryState State = getScalarsVectorizationState(
S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
- if (S.isAltOpCopy()) {
+ if (IsAltOpCopy) {
+ State = TreeEntry::Vectorize;
for (Value *V : VL) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I)
@@ -8967,7 +8959,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
}
if (State == TreeEntry::NeedToGather ||
- (S.isAltOpCopy() && !has_single_bit(UniqueValues.size()))) {
+ (IsAltOpCopy && !has_single_bit(UniqueValues.size()))) {
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
return;
@@ -8985,15 +8977,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BlockScheduling &BS = *BSRef;
std::optional<ScheduleData *> Bundle;
- Bundle = BS.tryScheduleBundle(UniqueValues, this, S, AnyCopies);
+ Bundle = BS.tryScheduleBundle(UniqueValues, this, S, AnyCopies, IsAltOpCopy);
#ifdef EXPENSIVE_CHECKS
// Make sure we didn't break any internal invariants
BS.verify();
#endif
- if (!Bundle || (S.isAltOpCopy() && !Bundle.value())) {
+ if (!Bundle || (IsAltOpCopy && !Bundle.value())) {
LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
- !BS.getScheduleData(VL0)->isPartOfBundle() || S.isAltOpCopy() ||
+ !BS.getScheduleData(VL0)->isPartOfBundle() ||
(BS.getScheduleData(VL0)->TE &&
BS.getScheduleData(VL0)->TE->isAltOpCopy())) &&
"tryScheduleBundle should cancelScheduling on failure");
@@ -9386,7 +9378,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
case Instruction::ShuffleVector: {
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
+ ReuseShuffleIndices, {}, 0, IsAltOpCopy);
if (S.isAltShuffle()) {
LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
TE->dump());
@@ -9396,7 +9388,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
TE->dump());
}
- if (S.isAltOpCopy() && !isCopyableOp(VL, S.getMainOp(), S.getAltOp())) {
+ if (IsAltOpCopy && !isCopyableOp(VL, S.getMainOp(), S.getAltOp())) {
ValueList Left, Right;
unsigned Opcode0 = S.getOpcode();
unsigned Opcode1 = S.getAltOpcode();
@@ -9440,7 +9432,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
buildTree_rec(newVL, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
return;
- } else if (S.isAltOpCopy()) {
+ } else if (IsAltOpCopy) {
ValueList Left, Right;
unsigned Opcode0 = S.getOpcode();
VLOperands Ops(VL, S, *this);
@@ -11622,12 +11614,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
unsigned Idx) const {
ArrayRef<Value *> VL = E->getOperand(Idx);
- InstructionsState S = getSameOpcode(VL, *TLI);
+ InstructionsState S = getCombinedOpcode(VL, *TLI);
// Special processing for GEPs bundle, which may include non-gep values.
if (!S && VL.front()->getType()->isPointerTy()) {
const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
if (It != VL.end())
- S = getSameOpcode(*It, *TLI);
+ S = getCombinedOpcode(*It, *TLI);
}
if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx, VL, S))
return VE;
@@ -13166,8 +13158,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
std::optional<unsigned> InsertIdx = getElementIndex(VU);
if (InsertIdx) {
const TreeEntry *ScalarTE = &EU.E;
- if (!ScalarTE)
- continue;
auto *It = find_if(
ShuffledInserts,
[this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
@@ -15285,12 +15275,12 @@ BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx,
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
ValueList &VL = E->getOperand(NodeIdx);
- InstructionsState S = getSameOpcode(VL, *TLI);
+ InstructionsState S = getCombinedOpcode(VL, *TLI);
// Special processing for GEPs bundle, which may include non-gep values.
if (!S && VL.front()->getType()->isPointerTy()) {
const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
if (It != VL.end())
- S = getSameOpcode(*It, *TLI);
+ S = getCombinedOpcode(*It, *TLI);
}
const unsigned VF = VL.size();
if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx, VL, S)) {
@@ -17629,13 +17619,15 @@ void BoUpSLP::optimizeGatherSequence() {
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S,
- bool &ReSchedule) {
+ bool &ReSchedule, bool IsAltOpCopy) {
ScheduleData *Bundle = nullptr;
ScheduleData *PrevInBundle = nullptr;
unsigned Opcode = S.getOpcode();
ValueList Keys;
for (Value *V : VL) {
+ if (doesNotNeedToBeScheduled(V) && IsAltOpCopy)
+ return nullptr;
auto *SD = getScheduleData(V);
bool FoundKey = false;
if (SD && !SD->isPartOfBundle()) {
@@ -17694,7 +17686,7 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Group the instructions to a bundle.
BundleMember->FirstInBundle = Bundle;
- if (S.isAltOpCopy() && IsAltInst)
+ if (IsAltOpCopy && IsAltInst)
BundleMember->IsCopy = true;
PrevInBundle = BundleMember;
if (SLP->CopyableAltOp.contains(I)) {
@@ -17714,7 +17706,7 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S,
- bool AnyCopies) {
+ bool AnyCopies, bool IsAltOpCopy) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
if (isa<PHINode>(S.getMainOp()) ||
@@ -17723,7 +17715,6 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
- bool IsAltOpCopy = S.isAltOpCopy();
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
auto TryScheduleBundleImpl = [this, OldScheduleEnd, IsAltOpCopy, AnyCopies,
@@ -17801,7 +17792,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ReSchedule = true;
}
- auto *Bundle = buildBundle(VL, SLP, S, ReSchedule);
+ auto *Bundle = buildBundle(VL, SLP, S, ReSchedule, IsAltOpCopy);
if (!Bundle)
return std::nullopt;
TryScheduleBundleImpl(ReSchedule, Bundle);
@@ -18051,9 +18042,20 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
if (auto *I = dyn_cast<Instruction>(U)) {
doForAllOpcodes(I, [&](ScheduleData *UseSD) {
ScheduleData *DestBundle = UseSD->FirstInBundle;
- if ((UseSD->IsCopy || BundleMember->IsCopy) &&
+ if (BundleMember->IsCopy && !UseSD->IsCopy &&
DestBundle == BundleMember->FirstInBundle)
return;
+ // For copy operations, check for inner dependencies. We cannot
+ // support such dependencies when they come from a main operation;
+ // they may only come from an alternative one. For now, we also
+ // ignore dependencies of alternative operations on any alternative.
+ if (BundleMember->TE && BundleMember->TE->isAltOpCopy() &&
+ (!BundleMember->IsCopy || UseSD->IsCopy) &&
+ DestBundle == BundleMember->FirstInBundle) {
+ BundleMember->Dependencies++;
+ BundleMember->incrementUnscheduledDeps(1);
+ }
BundleMember->Dependencies++;
if (!DestBundle->IsScheduled)
BundleMember->incrementUnscheduledDeps(1);
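For orientation, the extra in-bundle dependency counted above keeps a copyable bundle from firing before the value a copy lane forwards has been produced. A toy model of the readiness rule those counters feed (stand-in types, not the scheduler's real classes):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  using namespace llvm;

  struct ToyMember {
    int UnscheduledDeps = 0;
    void addDep() { ++UnscheduledDeps; }     // mirrors incrementUnscheduledDeps(1)
    void satisfyDep() { --UnscheduledDeps; } // a producer got scheduled
  };

  // A bundle is ready only when every member has no unscheduled deps.
  static bool bundleReady(ArrayRef<ToyMember *> Bundle) {
    return all_of(Bundle,
                  [](const ToyMember *M) { return M->UnscheduledDeps == 0; });
  }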
@@ -18243,15 +18245,13 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
}
};
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
- SmallDenseMap<Value *, ScheduleData *> CopyElementsMap;
// Ensure that all dependency data is updated (for nodes in the sub-graph)
// and fill the ready-list with initial instructions.
int Idx = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
- BS->doForAllOpcodes(I, [this, &Idx, &CopyElementsMap,
- BS](ScheduleData *SD) {
+ BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
[[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(SD->Inst);
assert((isVectorLikeInstWithConstOps(SD->Inst) || SD->IsCopy ||
SD->isPartOfBundle() ==
@@ -18259,16 +18259,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
!doesNotNeedToSchedule(SDTEs.front()->Scalars))) &&
"scheduler and vectorizer bundle mismatch");
SD->FirstInBundle->SchedulingPriority = Idx++;
- for (TreeEntry *SDTE : SDTEs)
- if (SDTE && SDTE->isAltOpCopy()) {
- ScheduleData *Bundle = SD->FirstInBundle;
- for (ScheduleData *BundleMember = Bundle; BundleMember;
- BundleMember = BundleMember->NextInBundle) {
- if (BundleMember->IsCopy)
- CopyElementsMap[BundleMember->Inst] = Bundle;
- }
- }
-
if (SD->isSchedulingEntity() && SD->isPartOfBundle())
BS->calculateDependencies(SD, false, this);
});
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 7fa746dc758a9..917ad682e26cc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -63,9 +63,18 @@ define void @add1(ptr noalias %dst, ptr noalias %src) {
;
; COPYABLE-LABEL: @add1(
; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
-; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; COPYABLE-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; COPYABLE-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 1, i32 2>
+; COPYABLE-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; COPYABLE-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
+; COPYABLE-NEXT: store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4
; COPYABLE-NEXT: ret void
;
entry:
@@ -90,44 +99,21 @@ entry:
}
define void @sub0(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @sub0(
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; NON-POW2-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; NON-POW2-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; NON-POW2-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; NON-POW2-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT: ret void
-;
-; POW2-ONLY-LABEL: @sub0(
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; POW2-ONLY-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT: ret void
-;
-; COPYABLE-LABEL: @sub0(
-; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 -1, i32 0, i32 -2, i32 -3>
-; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
-; COPYABLE-NEXT: ret void
+; CHECK-LABEL: @sub0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -211,55 +197,23 @@ entry:
}
define void @addsub0(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @addsub0(
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; NON-POW2-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; NON-POW2-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; NON-POW2-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; NON-POW2-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; NON-POW2-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT: ret void
-;
-; POW2-ONLY-LABEL: @addsub0(
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; POW2-ONLY-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; POW2-ONLY-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT: ret void
-;
-; COPYABLE-LABEL: @addsub0(
-; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 0>
-; COPYABLE-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
-; COPYABLE-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; COPYABLE-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; COPYABLE-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; COPYABLE-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; COPYABLE-NEXT: ret void
+; CHECK-LABEL: @addsub0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -283,55 +237,23 @@ entry:
}
define void @addsub1(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @addsub1(
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; NON-POW2-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; NON-POW2-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; NON-POW2-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; NON-POW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; NON-POW2-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
-; NON-POW2-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; NON-POW2-NEXT: ret void
-;
-; POW2-ONLY-LABEL: @addsub1(
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
-; POW2-ONLY-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; POW2-ONLY-NEXT: ret void
-;
-; COPYABLE-LABEL: @addsub1(
-; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; COPYABLE-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; COPYABLE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; COPYABLE-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
-; COPYABLE-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP4]], <i32 0, i32 -3>
-; COPYABLE-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; COPYABLE-NEXT: ret void
+; CHECK-LABEL: @addsub1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; CHECK-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
+; CHECK-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -355,44 +277,21 @@ entry:
}
define void @mul(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @mul(
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
-; NON-POW2-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; NON-POW2-NEXT: [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; NON-POW2-NEXT: store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
-; NON-POW2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
-; NON-POW2-NEXT: store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
-; NON-POW2-NEXT: ret void
-;
-; POW2-ONLY-LABEL: @mul(
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
-; POW2-ONLY-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT: store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
-; POW2-ONLY-NEXT: store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT: ret void
-;
-; COPYABLE-LABEL: @mul(
-; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[TMP0]], <i32 257, i32 -3, i32 1, i32 -9>
-; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
-; COPYABLE-NEXT: ret void
+; CHECK-LABEL: @mul(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; CHECK-NEXT: store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; CHECK-NEXT: store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -445,9 +344,18 @@ define void @shl0(ptr noalias %dst, ptr noalias %src) {
;
; COPYABLE-LABEL: @shl0(
; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
-; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; COPYABLE-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; COPYABLE-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 2>
+; COPYABLE-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; COPYABLE-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
+; COPYABLE-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
; COPYABLE-NEXT: ret void
;
entry:
@@ -561,9 +469,18 @@ define void @add1f(ptr noalias %dst, ptr noalias %src) {
;
; COPYABLE-LABEL: @add1f(
; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; COPYABLE-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; COPYABLE-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; COPYABLE-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; COPYABLE-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
; COPYABLE-NEXT: ret void
;
entry:
@@ -588,44 +505,21 @@ entry:
}
define void @sub0f(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @sub0f(
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; NON-POW2-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT: store float [[ADD]], ptr [[DST]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; NON-POW2-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; NON-POW2-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT: ret void
-;
-; POW2-ONLY-LABEL: @sub0f(
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT: store float [[ADD]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; POW2-ONLY-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT: ret void
-;
-; COPYABLE-LABEL: @sub0f(
-; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; COPYABLE-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; COPYABLE-NEXT: ret void
+; CHECK-LABEL: @sub0f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[ADD]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -709,55 +603,23 @@ entry:
}
define void @addsub0f(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @addsub0f(
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; NON-POW2-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT: store float [[SUB]], ptr [[DST]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; NON-POW2-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; NON-POW2-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; NON-POW2-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT: ret void
-;
-; POW2-ONLY-LABEL: @addsub0f(
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT: store float [[SUB]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; POW2-ONLY-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT: ret void
-;
-; COPYABLE-LABEL: @addsub0f(
-; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00>
-; COPYABLE-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; COPYABLE-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; COPYABLE-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; COPYABLE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; COPYABLE-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; COPYABLE-NEXT: ret void
+; CHECK-LABEL: @addsub0f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -781,55 +643,23 @@ entry:
}
define void @addsub1f(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @addsub1f(
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; NON-POW2-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; NON-POW2-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; NON-POW2-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; NON-POW2-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; NON-POW2-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; NON-POW2-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
-; NON-POW2-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; NON-POW2-NEXT: ret void
-;
-; POW2-ONLY-LABEL: @addsub1f(
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
-; POW2-ONLY-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; POW2-ONLY-NEXT: ret void
-;
-; COPYABLE-LABEL: @addsub1f(
-; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; COPYABLE-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; COPYABLE-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; COPYABLE-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; COPYABLE-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP4]], <float 0.000000e+00, float -3.000000e+00>
-; COPYABLE-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; COPYABLE-NEXT: ret void
+; CHECK-LABEL: @addsub1f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
+; CHECK-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -853,44 +683,21 @@ entry:
}
define void @mulf(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @mulf(
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; NON-POW2-NEXT: [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; NON-POW2-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; NON-POW2-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; NON-POW2-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; NON-POW2-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; NON-POW2-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; NON-POW2-NEXT: ret void
-;
-; POW2-ONLY-LABEL: @mulf(
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; POW2-ONLY-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT: ret void
-;
-; COPYABLE-LABEL: @mulf(
-; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = fmul fast <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
-; COPYABLE-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; COPYABLE-NEXT: ret void
+; CHECK-LABEL: @mulf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -1107,49 +914,21 @@ entry:
}
define void @mulfn(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @mulfn(
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; NON-POW2-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; NON-POW2-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; NON-POW2-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; NON-POW2-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; NON-POW2-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; NON-POW2-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; NON-POW2-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; NON-POW2-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; NON-POW2-NEXT: ret void
-;
-; POW2-ONLY-LABEL: @mulfn(
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; POW2-ONLY-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT: ret void
-;
-; COPYABLE-LABEL: @mulfn(
-; COPYABLE-NEXT: entry:
-; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; COPYABLE-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; COPYABLE-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; COPYABLE-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; COPYABLE-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; COPYABLE-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], <float 1.000000e+00, float -9.000000e+00>
-; COPYABLE-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
-; COPYABLE-NEXT: ret void
+; CHECK-LABEL: @mulfn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1