[llvm] [WIP][SLP] SLP's copyable elements based upon Main/Alt operations. (PR #124242)
Dinar Temirbulatov via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 24 00:37:16 PST 2025
https://github.com/dtemirbulatov created https://github.com/llvm/llvm-project/pull/124242
Added a testcase from https://github.com/llvm/llvm-project/issues/110740.
There are still several issues with this change, reproducible with LNT by adding "-mllvm -slp-vectorize-copyable=true -mllvm -slp-threshold=-99999"; support for demoting values and for float operations with "fast-math" is still missing.
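To illustrate the idea, here is a minimal sketch adapted from the @add1 test in the patch: a lane that merely copies a value can be modeled as an add with the identity constant, so the whole group still vectorizes (value names are illustrative):

  ; scalar form: lane 0 is a plain copy, lanes 1-3 add a constant
  %0 = load i32, ptr %src
  store i32 %0, ptr %dst            ; copyable lane, treated as "add %0, 0"
  ...
  ; vectorized form with -mllvm -slp-vectorize-copyable=true
  %v = load <4 x i32>, ptr %src
  %a = add nsw <4 x i32> %v, <i32 0, i32 1, i32 2, i32 3>
  store <4 x i32> %a, ptr %dst

The identity constant depends on the main opcode (0 for add/or/xor, 1 for mul), matching what ConstantExpr::getBinOpIdentity returns.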
From 6b92c26d790e0172be9df8f2f034e3f2d7cd0a8e Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dtemirbulatov at gmail.com>
Date: Sat, 18 Jan 2025 21:01:52 +0000
Subject: [PATCH] [WIP][SLP] SLP's copyable elements based upon Main/Alt
operations.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 763 +++++++++++++++---
.../X86/vect_copyable_in_binops.ll | 534 +++++++++---
2 files changed, 1084 insertions(+), 213 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c98d872fb6467f..47b61496b5e155 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -201,6 +201,10 @@ static cl::opt<bool> VectorizeNonPowerOf2(
"slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
+static cl::opt<bool>
+ VectorizeCopyable("slp-vectorize-copyable", cl::init(false), cl::Hidden,
+ cl::desc("Try to vectorize with copyable elements."));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -426,6 +430,8 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
if (isa<ExtractElementInst>(I))
return isConstant(I->getOperand(1));
assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
+  if (I->getNumOperands() < 3)
+    return false;
return isConstant(I->getOperand(2));
}
@@ -594,6 +600,41 @@ static std::optional<unsigned> getElementIndex(const Value *Inst,
return Index;
}
+/// Checks whether an instruction with \p Opcode can be treated as an operand
+/// of the (possibly binary) operation \p I.
+/// \returns The opcode of \p I if an instruction with \p Opcode can be
+/// treated as an operand of \p I with a default value, and 0 otherwise.
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+ if (Opcode != Instruction::PHI && Opcode != Instruction::Invoke &&
+ !isa<FPMathOperator>(I) &&
+ ((I->getType()->isIntegerTy() &&
+ (I->getOpcode() == Instruction::Add ||
+ I->getOpcode() == Instruction::And ||
+ I->getOpcode() == Instruction::AShr ||
+ I->getOpcode() == Instruction::BitCast ||
+ I->getOpcode() == Instruction::Call ||
+          // Disabled due to a scheduling issue with
+          // isVectorLikeInstWithConstOps operations:
+ // I->getOpcode() == Instruction::ExtractElement ||
+ // I->getOpcode() == Instruction::ExtractValue ||
+ I->getOpcode() == Instruction::ICmp ||
+ I->getOpcode() == Instruction::Load ||
+ I->getOpcode() == Instruction::LShr ||
+ I->getOpcode() == Instruction::Mul ||
+ I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::PtrToInt ||
+ I->getOpcode() == Instruction::Select ||
+ I->getOpcode() == Instruction::SExt ||
+ I->getOpcode() == Instruction::Shl ||
+ I->getOpcode() == Instruction::Sub ||
+ I->getOpcode() == Instruction::Trunc ||
+ I->getOpcode() == Instruction::Xor ||
+ I->getOpcode() == Instruction::ZExt))))
+ return I->getOpcode();
+ return 0;
+}
+
namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
@@ -853,6 +894,16 @@ class InstructionsState {
} // end anonymous namespace
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as the main operation of \p S, the key is \p Op.
+/// Otherwise the key is the main operation of \p S.
+static Value *isOneOf(const InstructionsState &S, Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && S.isOpcodeOrAlt(I))
+ return Op;
+ return S.getMainOp();
+}
+
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
@@ -865,6 +916,14 @@ static bool isValidForAlternation(unsigned Opcode) {
return true;
}
+// Check for inner dependencies. We cannot support such dependencies if they
+// come from a main operation, only from an alternative one.
+static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
+ const InstructionsState &S);
+
+// Determine whether the vector could be vectorized with copyable elements.
+static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt);
+
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
const TargetLibraryInfo &TLI);
@@ -917,19 +976,53 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
return InstructionsState::invalid();
Instruction *MainOp = cast<Instruction>(*It);
+ Instruction *AltOp = MainOp;
+ unsigned Opcode = MainOp->getOpcode();
+ unsigned AltOpcode = Opcode;
+ for (Value *V : iterator_range(It + 1, VL.end())) {
+ Instruction *Inst = dyn_cast<Instruction>(V);
+ if (!Inst)
+ continue;
+ unsigned VOpcode = Inst->getOpcode();
+    if (AltOpcode == Opcode && !isa<PHINode>(Inst) &&
+ VOpcode != Opcode && isValidForAlternation(VOpcode)) {
+ AltOpcode = VOpcode;
+ AltOp = Inst;
+ break;
+ }
+ }
unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
(VL.size() == 2 && InstCnt < 2))
return InstructionsState::invalid();
+ bool IsBinOp = isa<BinaryOperator>(MainOp);
+ bool IsCopyable = false;
+ if (MainOp && AltOp && MainOp != AltOp) {
+ if (!IsBinOp && isa<BinaryOperator>(AltOp) && !isa<PHINode>(MainOp)) {
+ std::swap(MainOp, AltOp);
+ std::swap(AltOpcode, Opcode);
+ IsBinOp = true;
+ }
+ IsCopyable = VectorizeCopyable && isCopyableOp(VL, MainOp, AltOp);
+ if (IsCopyable && isa<CmpInst>(AltOp)) {
+ Type *Ty0 = MainOp->getOperand(0)->getType();
+ Type *Ty1 = AltOp->getOperand(0)->getType();
+ if (Ty0 != Ty1)
+ return InstructionsState::invalid();
+ }
+ if (!IsCopyable) {
+ MainOp = cast<Instruction>(*It);
+ AltOp = MainOp;
+ Opcode = MainOp->getOpcode();
+ AltOpcode = Opcode;
+ IsBinOp = isa<BinaryOperator>(MainOp);
+ }
+ }
bool IsCastOp = isa<CastInst>(MainOp);
- bool IsBinOp = isa<BinaryOperator>(MainOp);
bool IsCmpOp = isa<CmpInst>(MainOp);
CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
: CmpInst::BAD_ICMP_PREDICATE;
- Instruction *AltOp = MainOp;
- unsigned Opcode = MainOp->getOpcode();
- unsigned AltOpcode = Opcode;
bool SwappedPredsCompatible = IsCmpOp && [&]() {
SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
@@ -984,7 +1077,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
AltOp = I;
continue;
}
- } else if (IsCastOp && isa<CastInst>(I)) {
+ } else if ((IsCastOp || IsCopyable) && isa<CastInst>(I)) {
Value *Op0 = MainOp->getOperand(0);
Type *Ty0 = Op0->getType();
Value *Op1 = I->getOperand(0);
@@ -1001,13 +1094,15 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
continue;
}
}
- } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
+ } else if (auto *Inst = dyn_cast<CmpInst>(I);
+ Inst && (IsCmpOp || IsCopyable)) {
auto *BaseInst = cast<CmpInst>(MainOp);
Type *Ty0 = BaseInst->getOperand(0)->getType();
Type *Ty1 = Inst->getOperand(0)->getType();
if (Ty0 == Ty1) {
- assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
- assert(InstOpcode == AltOpcode &&
+ assert((IsCopyable || InstOpcode == Opcode) &&
+ "Expected same CmpInst opcode.");
+ assert((IsCopyable || InstOpcode == AltOpcode) &&
"Alternate instructions are only supported by BinaryOperator "
"and CastInst.");
// Check for compatible operands. If the corresponding operands are not
@@ -1038,23 +1133,32 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
AltPred == CurrentPred || AltPred == SwappedCurrentPred)
continue;
}
- } else if (InstOpcode == Opcode) {
- assert(InstOpcode == AltOpcode &&
+ } else if (InstOpcode == Opcode ||
+ (IsCopyable && InstOpcode == AltOpcode)) {
+ assert((IsCopyable || InstOpcode == AltOpcode) &&
"Alternate instructions are only supported by BinaryOperator and "
"CastInst.");
+ Instruction *Op = MainOp;
+ if (IsCopyable) {
+ if (InstOpcode != Opcode && InstOpcode != AltOpcode) {
+ Op = I;
+ } else if (Opcode != AltOpcode && InstOpcode == AltOpcode) {
+ Op = AltOp;
+ }
+ }
if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
if (Gep->getNumOperands() != 2 ||
- Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
+ Gep->getOperand(0)->getType() != Op->getOperand(0)->getType())
return InstructionsState::invalid();
} else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
if (!isVectorLikeInstWithConstOps(EI))
return InstructionsState::invalid();
} else if (auto *LI = dyn_cast<LoadInst>(I)) {
- auto *BaseLI = cast<LoadInst>(MainOp);
+ auto *BaseLI = cast<LoadInst>(Op);
if (!LI->isSimple() || !BaseLI->isSimple())
return InstructionsState::invalid();
} else if (auto *Call = dyn_cast<CallInst>(I)) {
- auto *CallBase = cast<CallInst>(MainOp);
+ auto *CallBase = cast<CallInst>(Op);
if (Call->getCalledFunction() != CallBase->getCalledFunction())
return InstructionsState::invalid();
if (Call->hasOperandBundles() &&
@@ -1069,13 +1173,14 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
return InstructionsState::invalid();
if (!ID) {
SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
- if (Mappings.size() != BaseMappings.size() ||
- Mappings.front().ISA != BaseMappings.front().ISA ||
- Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
- Mappings.front().VectorName != BaseMappings.front().VectorName ||
- Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
- Mappings.front().Shape.Parameters !=
- BaseMappings.front().Shape.Parameters)
+      if (!Mappings.empty() &&
+ (Mappings.size() != BaseMappings.size() ||
+ Mappings.front().ISA != BaseMappings.front().ISA ||
+ Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
+ Mappings.front().VectorName != BaseMappings.front().VectorName ||
+ Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
+ Mappings.front().Shape.Parameters !=
+ BaseMappings.front().Shape.Parameters))
return InstructionsState::invalid();
}
}
@@ -1124,6 +1229,46 @@ static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
}
}
+static bool checkCopyableInnerDep(ArrayRef<Value *> VL,
+ const InstructionsState &S) {
+ SmallSet<Value *, 4> Ops;
+ unsigned Opcode = S.getOpcode();
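+  // Collect the values in VL whose instruction matches the main opcode.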
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (I->getOpcode() == Opcode)
+ Ops.insert(V);
+ }
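+  // Return false if any lane uses one of those main-opcode instructions as
+  // an operand, i.e. the bundle has an inner dependency on a main operation.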
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ for (Use &U : I->operands()) {
+ if (auto *Op = dyn_cast<Instruction>(U.get()); Op && Ops.contains(Op))
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool isCopyableOp(ArrayRef<Value *> VL, Value *Main, Value *Alt) {
+ if (any_of(VL, IsaPred<PoisonValue>) || Main == Alt ||
+ !isa<BinaryOperator>(Main) || !isa<Instruction>(Alt) ||
+      any_of(VL, IsaPred<PHINode>))
+ return false;
+
+ Instruction *MainOp = cast<Instruction>(Main);
+ Instruction *AltOp = cast<Instruction>(Alt);
+
+ if (isa<BinaryOperator>(MainOp) && !isa<BinaryOperator>(AltOp) &&
+ isValidForAlternation(MainOp->getOpcode()) &&
+ isValidForAlternation(AltOp->getOpcode()) &&
+ tryToRepresentAsInstArg(MainOp->getOpcode(), AltOp) &&
+ tryToRepresentAsInstArg(AltOp->getOpcode(), MainOp))
+ return true;
+ return false;
+}
/// \returns the AA location that is being access by the instruction.
static MemoryLocation getLocation(Instruction *I) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -1463,6 +1608,7 @@ class BoUpSLP {
MultiNodeScalars.clear();
MustGather.clear();
NonScheduledFirst.clear();
+ CopyableAltOp.clear();
EntryToLastInstruction.clear();
LoadEntriesToVectorize.clear();
IsGraphTransformMode = false;
@@ -2461,8 +2607,16 @@ class BoUpSLP {
}
bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
- OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
- APO, false};
+ Instruction *Inst = cast<Instruction>(VL[Lane]);
+      if (Inst->getOpcode() != MainOp->getOpcode() &&
+          OpIdx >= Inst->getNumOperands()) {
+ OpsVec[OpIdx][Lane] = {
+ PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
+ false};
+ } else {
+ OpsVec[OpIdx][Lane] = {
+ cast<Instruction>(VL[Lane])->getOperand(OpIdx), APO, false};
+ }
}
}
}
@@ -3298,6 +3452,7 @@ class BoUpSLP {
///< complex node like select/cmp to minmax, mul/add to
///< fma, etc. Must be used for the following nodes in
///< the pattern, not the very first one.
+ CopyableVectorize, ///< The node for copyable elements.
};
EntryState State;
@@ -3357,7 +3512,8 @@ class BoUpSLP {
if (Operands.size() < OpIdx + 1)
Operands.resize(OpIdx + 1);
assert(Operands[OpIdx].empty() && "Already resized?");
- assert(OpVL.size() <= Scalars.size() &&
+ assert((State == TreeEntry::CopyableVectorize ||
+ OpVL.size() <= Scalars.size()) &&
"Number of operands is greater than the number of scalars.");
Operands[OpIdx].resize(OpVL.size());
copy(OpVL, Operands[OpIdx].begin());
@@ -3401,7 +3557,9 @@ class BoUpSLP {
}
/// Some of the instructions in the list have alternate opcodes.
- bool isAltShuffle() const { return S.isAltShuffle(); }
+ bool isAltShuffle() const {
+ return S.isAltShuffle() && State != TreeEntry::CopyableVectorize;
+ }
bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
@@ -3524,6 +3682,9 @@ class BoUpSLP {
case CombinedVectorize:
dbgs() << "CombinedVectorize\n";
break;
+ case CopyableVectorize:
+ dbgs() << "CopyableVectorize\n";
+ break;
}
if (S) {
dbgs() << "MainOp: " << *S.getMainOp() << "\n";
@@ -3619,6 +3780,7 @@ class BoUpSLP {
// for non-power-of-two vectors.
assert(
(hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
+ EntryState == TreeEntry::CopyableVectorize ||
ReuseShuffleIndices.empty()) &&
"Reshuffling scalars not yet supported for nodes with padding");
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
@@ -3642,8 +3804,13 @@ class BoUpSLP {
Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
}
if (!Last->isGather()) {
- for (Value *V : VL) {
+ unsigned Opcode = S.getOpcode();
+      for (unsigned I = 0, E = VL.size(); I != E; ++I) {
+        Value *V = VL[I];
const TreeEntry *TE = getTreeEntry(V);
+ Instruction *I = dyn_cast<Instruction>(V);
+      bool IsAltInst = I && I->getOpcode() != Opcode;
+
assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
"Scalar already in tree!");
if (TE) {
@@ -3651,6 +3818,10 @@ class BoUpSLP {
MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
continue;
}
+ if (EntryState == TreeEntry::CopyableVectorize && IsAltInst) {
+ CopyableAltOp.insert(V);
+ continue;
+ }
ScalarToTreeEntry[V] = Last;
}
// Update the scheduler bundle to point to this TreeEntry.
@@ -3725,6 +3896,10 @@ class BoUpSLP {
bool areAltOperandsProfitable(const InstructionsState &S,
ArrayRef<Value *> VL) const;
+  /// Check whether we can represent the operations as copyable by looking at
+  /// the operations' operands.
+ bool canRepresentAsCopyable(const InstructionsState &S, ArrayRef<Value *> VL);
+
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
TreeEntry::EntryState
@@ -3746,6 +3921,9 @@ class BoUpSLP {
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
+  /// A set of scalars that we are considering as copyable operations.
+ ValueSet CopyableAltOp;
+
/// A set of first non-schedulable values.
ValueSet NonScheduledFirst;
@@ -3875,15 +4053,16 @@ class BoUpSLP {
ScheduleData() = default;
- void init(int BlockSchedulingRegionID, Instruction *I) {
+ void init(int BlockSchedulingRegionID, Value *OpVal) {
FirstInBundle = this;
NextInBundle = nullptr;
NextLoadStore = nullptr;
IsScheduled = false;
SchedulingRegionID = BlockSchedulingRegionID;
clearDependencies();
- Inst = I;
+ OpValue = OpVal;
TE = nullptr;
+ IsCopy = false;
}
/// Verify basic self consistency properties
@@ -3990,6 +4169,9 @@ class BoUpSLP {
Instruction *Inst = nullptr;
+    /// The value used as the key for this schedule data.
+ Value *OpValue = nullptr;
+
/// The TreeEntry that this instruction corresponds to.
TreeEntry *TE = nullptr;
@@ -4037,6 +4219,9 @@ class BoUpSLP {
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
bool IsScheduled = false;
+
+  /// True if this instruction is a copy.
+ bool IsCopy = false;
};
#ifndef NDEBUG
@@ -4106,6 +4291,31 @@ class BoUpSLP {
return nullptr;
}
+ ScheduleData *getScheduleData(Value *V, Value *Key) {
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end()) {
+ ScheduleData *SD = I->second.lookup(Key);
+ if (SD && isInSchedulingRegion(SD))
+ return SD;
+ }
+ if (V == Key)
+ return getScheduleData(V);
+ return nullptr;
+ }
+
+ ScheduleData *getScheduleData(Value *V, const TreeEntry *E) {
+ ScheduleData *SD = getScheduleData(V);
+ if (SD && isInSchedulingRegion(SD) && SD->TE == E)
+ return SD;
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I == ExtraScheduleDataMap.end())
+ return nullptr;
+ for (auto &P : I->second)
+ if (isInSchedulingRegion(P.second) && P.second->TE == E)
+ return P.second;
+ return nullptr;
+ }
+
bool isInSchedulingRegion(ScheduleData *SD) const {
return SD->SchedulingRegionID == SchedulingRegionID;
}
@@ -4119,30 +4329,33 @@ class BoUpSLP {
for (ScheduleData *BundleMember = SD; BundleMember;
BundleMember = BundleMember->NextInBundle) {
-
// Handle the def-use chain dependencies.
// Decrement the unscheduled counter and insert to ready list if ready.
- auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
- ScheduleData *OpDef = getScheduleData(I);
- if (OpDef && OpDef->hasValidDependencies() &&
- OpDef->incrementUnscheduledDeps(-1) == 0) {
- // There are no more unscheduled dependencies after
- // decrementing, so we can put the dependent instruction
- // into the ready list.
- ScheduleData *DepBundle = OpDef->FirstInBundle;
- assert(!DepBundle->IsScheduled &&
- "already scheduled bundle gets ready");
- ReadyList.insert(DepBundle);
- LLVM_DEBUG(dbgs()
- << "SLP: gets ready (def): " << *DepBundle << "\n");
- }
+ auto &&DecrUnsched = [this, &ReadyList, &BundleMember](Instruction *I) {
+ doForAllOpcodes(I, [&ReadyList, &BundleMember,
+ &I](ScheduleData *OpDef) {
+ if (OpDef && OpDef->hasValidDependencies() &&
+ BundleMember->Inst != I &&
+ OpDef->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after
+ // decrementing, so we can put the dependent instruction
+ // into the ready list.
+ ScheduleData *DepBundle = OpDef->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (def): " << *DepBundle << "\n");
+ }
+ });
};
// If BundleMember is a vector bundle, its operands may have been
// reordered during buildTree(). We therefore need to get its operands
// through the TreeEntry.
- if (TreeEntry *TE = BundleMember->TE) {
+ if (TreeEntry *TE = BundleMember->TE;
+ TE && TE->State != TreeEntry::CopyableVectorize) {
// Need to search for the lane since the tree entry can be reordered.
auto *In = BundleMember->Inst;
int Lane = std::distance(TE->Scalars.begin(),
@@ -4158,6 +4371,7 @@ class BoUpSLP {
assert(
In &&
(isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
+ BundleMember->IsCopy ||
In->getNumOperands() == TE->getNumOperands()) &&
"Missed TreeEntry operands?");
@@ -4218,7 +4432,8 @@ class BoUpSLP {
"primary schedule data not in window?");
assert(isInSchedulingRegion(SD->FirstInBundle) &&
"entire bundle in window!");
- SD->verify();
+ (void)SD;
+ doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
}
for (auto *SD : ReadyInsts) {
@@ -4228,35 +4443,49 @@ class BoUpSLP {
}
}
+ void doForAllOpcodes(Value *V,
+ function_ref<void(ScheduleData *SD)> Action) {
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end())
+ for (auto &P : I->second)
+ if (isInSchedulingRegion(P.second))
+ Action(P.second);
+ if (ScheduleData *SD = getScheduleData(V))
+ Action(SD);
+ }
+
/// Put all instructions into the ReadyList which are ready for scheduling.
template <typename ReadyListType>
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- ScheduleData *SD = getScheduleData(I);
- if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
- SD->isReady()) {
- ReadyList.insert(SD);
- LLVM_DEBUG(dbgs()
- << "SLP: initially in ready list: " << *SD << "\n");
- }
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
+ SD->isReady()) {
+ ReadyList.insert(SD);
+ LLVM_DEBUG(dbgs()
+ << "SLP: initially in ready list: " << *SD << "\n");
+ }
+ });
}
}
/// Build a bundle from the ScheduleData nodes corresponding to the
/// scalar instruction for each lane.
- ScheduleData *buildBundle(ArrayRef<Value *> VL);
+ ScheduleData *buildBundle(ArrayRef<Value *> VL, const InstructionsState &S,
+ bool IsCopyable, bool &ReSchedule);
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
/// \returns the scheduling bundle. The returned Optional value is not
/// std::nullopt if \p VL is allowed to be scheduled.
- std::optional<ScheduleData *>
- tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S);
+ std::optional<ScheduleData *> tryScheduleBundle(ArrayRef<Value *> VL,
+ BoUpSLP *SLP,
+ const InstructionsState &S,
+ bool IsCopyable);
/// Un-bundles a group of instructions.
- void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
+ void cancelScheduling(ArrayRef<Value *> VL, ScheduleData *Bundle);
/// Allocates schedule data chunk.
ScheduleData *allocateScheduleDataChunks();
@@ -4296,6 +4525,10 @@ class BoUpSLP {
/// ScheduleData structures are recycled.
DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
+  /// Maps an instruction to its extra ScheduleData, indexed by the key value.
+ DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
+ ExtraScheduleDataMap;
+
/// The ready-list for scheduling (only used for the dry-run).
SetVector<ScheduleData *> ReadyInsts;
@@ -7490,6 +7723,57 @@ static bool isAlternateInstruction(const Instruction *I,
const Instruction *AltOp,
const TargetLibraryInfo &TLI);
+bool BoUpSLP::canRepresentAsCopyable(const InstructionsState &S,
+ ArrayRef<Value *> VL) {
+ unsigned Opcode0 = S.getOpcode();
+ unsigned Opcode1 = S.getAltOpcode();
+ DenseMap<unsigned, unsigned> AltOps;
+ SmallVector<unsigned> MainAltOps;
+ unsigned Operand;
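+  // The operand index shared by all copyable lanes; meaningful only once
+  // AltOps is non-empty.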
+
+ if (!checkCopyableInnerDep(VL, S))
+ return false;
+ if (isCopyableOp(VL, S.getMainOp(), S.getAltOp()))
+ return true;
+ if ((isa<BinaryOperator>(S.getMainOp()) && Opcode0 == Opcode1) ||
+ (!isValidForAlternation(Opcode0) || !isValidForAlternation(Opcode1)) ||
+ !tryToRepresentAsInstArg(S.getOpcode(), S.getAltOp()) ||
+ !tryToRepresentAsInstArg(S.getAltOpcode(), S.getMainOp()))
+ return false;
+ for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+ Instruction *Inst = dyn_cast<Instruction>(VL[I]);
+ if (!Inst)
+ return false;
+ if (Inst->getOpcode() == Opcode0) {
+ for (unsigned Op : seq<unsigned>(0, S.getMainOp()->getNumOperands())) {
+ Instruction *Inst1 = dyn_cast<Instruction>(Inst->getOperand(Op));
+ if (!Inst1)
+ continue;
+ if (Inst1->getOpcode() == Opcode0)
+ return false;
+ if (Inst1->isBinaryOp() && !isa<ConstantInt>(Inst1->getOperand(1)))
+ return false;
+ if (AltOps.contains(I) ||
+            (!AltOps.empty() && Op != Operand && !Inst1->isCommutative()))
+ return false;
+ if (Inst1->getOpcode() == Opcode1) {
+ if (Inst1->isBinaryOp() && !isa<ConstantInt>(Inst1->getOperand(1)))
+ return false;
+          if (AltOps.empty())
+ Operand = Op;
+ AltOps[I] = Op;
+ }
+ }
+ } else if (Inst->getOpcode() == Opcode1) {
+ MainAltOps.push_back(I);
+ }
+ }
+  if (!AltOps.empty() && !MainAltOps.empty())
+ return true;
+
+ return false;
+}
+
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
ArrayRef<Value *> VL) const {
unsigned Opcode0 = S.getOpcode();
@@ -7500,6 +7784,8 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
Opcode0, Opcode1, OpcodeMask))
return true;
SmallVector<ValueList> Operands;
+ if (S.getMainOp()->getNumOperands() != S.getAltOp()->getNumOperands())
+ return false;
for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
Operands.emplace_back();
// Prepare the operand vector.
@@ -7947,6 +8233,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::Vectorize;
}
case Instruction::ShuffleVector: {
+ if (VectorizeCopyable && isCopyableOp(VL, S.getMainOp(), S.getAltOp()) &&
+ checkCopyableInnerDep(VL, S))
+ return TreeEntry::CopyableVectorize;
if (!S.isAltShuffle()) {
// REVEC can support non alternate shuffle.
if (SLPReVec && getShufflevectorNumGroups(VL))
@@ -7964,6 +8253,14 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
return TreeEntry::NeedToGather;
}
+ if (VectorizeCopyable) {
+ if (canRepresentAsCopyable(S, VL))
+ return TreeEntry::CopyableVectorize;
+
+ if (!tryToRepresentAsInstArg(S.getOpcode(), S.getAltOp()) ||
+ !tryToRepresentAsInstArg(S.getAltOpcode(), S.getMainOp()))
+ return TreeEntry::NeedToGather;
+ }
return TreeEntry::Vectorize;
}
default:
@@ -8258,6 +8555,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return false;
if (Depth >= RecursionMaxDepth - 1)
return true;
+
// Check if all operands are extracts, part of vector node or can build a
// regular vectorize node.
SmallVector<unsigned, 8> InstsCount;
@@ -8278,6 +8576,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
auto *I1 = cast<Instruction>(VL.front());
auto *I2 = cast<Instruction>(VL.back());
+ if (I1->getNumOperands() != I2->getNumOperands())
+ return true;
for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
Candidates.emplace_back().emplace_back(I1->getOperand(Op),
I2->getOperand(Op));
@@ -8418,7 +8718,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
SmallVector<Value *> PointerOps;
TreeEntry::EntryState State = getScalarsVectorizationState(
S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
- if (State == TreeEntry::NeedToGather) {
+ if (State == TreeEntry::NeedToGather ||
+ (State == TreeEntry::CopyableVectorize &&
+ !has_single_bit(UniqueValues.size()))) {
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
return;
@@ -8429,18 +8731,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BSRef = std::make_unique<BlockScheduling>(BB);
BlockScheduling &BS = *BSRef;
-
- std::optional<ScheduleData *> Bundle =
- BS.tryScheduleBundle(UniqueValues, this, S);
+ std::optional<ScheduleData *> Bundle;
+ Bundle = BS.tryScheduleBundle(UniqueValues, this, S,
+ State == TreeEntry::CopyableVectorize);
#ifdef EXPENSIVE_CHECKS
// Make sure we didn't break any internal invariants
BS.verify();
#endif
- if (!Bundle) {
+ if (!Bundle || (State == TreeEntry::CopyableVectorize && !Bundle.value())) {
LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
- assert((!BS.getScheduleData(VL0) ||
- !BS.getScheduleData(VL0)->isPartOfBundle()) &&
- "tryScheduleBundle should cancelScheduling on failure");
+ assert(
+ (!BS.getScheduleData(VL0) ||
+ !BS.getScheduleData(VL0)->isPartOfBundle() ||
+ State == TreeEntry::CopyableVectorize ||
+ (BS.getScheduleData(VL0)->TE && BS.getScheduleData(VL0)->TE->State ==
+ TreeEntry::CopyableVectorize)) &&
+ "tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
NonScheduledFirst.insert(VL.front());
@@ -8585,6 +8891,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TE->dump());
break;
case TreeEntry::CombinedVectorize:
+ case TreeEntry::CopyableVectorize:
case TreeEntry::NeedToGather:
llvm_unreachable("Unexpected loads state.");
}
@@ -8829,8 +9136,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
case Instruction::ShuffleVector: {
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
+ TreeEntry *TE =
+ (State != TreeEntry::CopyableVectorize)
+ ? newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices)
+ : newTreeEntry(VL, TreeEntry::CopyableVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndices);
if (S.isAltShuffle()) {
LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
TE->dump());
@@ -8841,6 +9152,79 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TE->dump());
}
+ if (State == TreeEntry::CopyableVectorize &&
+ !isCopyableOp(VL, S.getMainOp(), S.getAltOp())) {
+ ValueList Left, Right;
+ unsigned Opcode0 = S.getOpcode();
+ unsigned Opcode1 = S.getAltOpcode();
+
+ unsigned Operand;
+ bool IsOperandSet = false;
+ ValueList newMainVL;
+ ValueList newVL;
+
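+      // Build the new left-hand vector: for main-opcode lanes take their
+      // first operand, for copyable lanes take the lane instruction itself.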
+ for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+ Instruction *Inst = cast<Instruction>(VL[I]);
+ if (Inst->getOpcode() == Opcode0) {
+ newMainVL.push_back(VL[I]);
+ unsigned Op = 0;
+ Instruction *Inst1 = dyn_cast<Instruction>(Inst->getOperand(Op));
+ if (!Inst1) {
+ newVL.push_back(Inst->getOperand(Op));
+ continue;
+ }
+
+ if (IsOperandSet && Op != Operand && !Inst1->isCommutative())
+ return;
+
+ if (Inst1->getOpcode() == Opcode1) {
+ if (!IsOperandSet) {
+ Operand = Op;
+ IsOperandSet = true;
+ }
+ }
+ newVL.push_back(Inst1);
+ } else if (Inst->getOpcode() == Opcode1) {
+ newVL.push_back(Inst);
+ }
+ }
+ VLOperands Ops(VL, S, *this);
+ Left = Ops.getVL(0);
+ Right = Ops.getVL(1);
+ for (unsigned I = 0, VF = VL.size(); I < VF; ++I)
+ if ((cast<Instruction>(VL[I]))->getOpcode() != Opcode0) {
+ Right[I] = ConstantExpr::getBinOpIdentity(
+ Opcode0, Right[0]->getType(), true);
+ }
+
+ TE->setOperand(0, newVL);
+ TE->setOperand(1, Right);
+ buildTree_rec(newVL, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ return;
+ } else if (State == TreeEntry::CopyableVectorize) {
+ ValueList Left, Right;
+ unsigned Opcode0 = S.getOpcode();
+ VLOperands Ops(VL, S, *this);
+ Left = Ops.getVL(0);
+ Right = Ops.getVL(1);
+ ValueList Left_new, Right_new;
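+      // For copyable lanes keep the lane value itself as the left operand
+      // and use the identity constant of the main opcode as the right one.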
+ for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+ if ((cast<Instruction>(VL[I]))->getOpcode() != Opcode0) {
+ Left_new.push_back(VL[I]);
+ Right_new.push_back(ConstantExpr::getBinOpIdentity(
+ Opcode0, S.getMainOp()->getType(), true));
+ } else {
+ Left_new.push_back(Left[I]);
+ Right_new.push_back(Right[I]);
+ }
+ }
+ TE->setOperand(0, Left_new);
+ TE->setOperand(1, Right_new);
+ buildTree_rec(Left_new, Depth + 1, {TE, 0});
+ buildTree_rec(Right_new, Depth + 1, {TE, 1});
+ return;
+ }
// Reorder operands if reordering would enable vectorization.
auto *CI = dyn_cast<CmpInst>(VL0);
if (CI && any_of(VL, [](Value *V) {
@@ -11147,7 +11531,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize ||
- E->State == TreeEntry::StridedVectorize) &&
+ E->State == TreeEntry::StridedVectorize ||
+ E->State == TreeEntry::CopyableVectorize) &&
"Unhandled state");
assert(E->getOpcode() &&
((allSameType(VL) && allSameBlock(VL)) ||
@@ -11156,7 +11541,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
"Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
- E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+ (E->isAltShuffle() && E->State != TreeEntry::CopyableVectorize)
+ ? (unsigned)Instruction::ShuffleVector
+ : E->getOpcode();
if (E->CombinedOp != TreeEntry::NotCombinedOp)
ShuffleOrOp = E->CombinedOp;
SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
@@ -11237,7 +11624,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// Negative value means vectorizing is profitable.
auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
assert((E->State == TreeEntry::Vectorize ||
- E->State == TreeEntry::StridedVectorize) &&
+ E->State == TreeEntry::StridedVectorize ||
+ E->State == TreeEntry::CopyableVectorize) &&
"Entry state expected to be Vectorize or StridedVectorize here.");
InstructionCost ScalarCost = 0;
InstructionCost VecCost = 0;
@@ -11669,6 +12057,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto GetVectorCost = [&](InstructionCost CommonCost) {
InstructionCost VecLdCost;
switch (E->State) {
+ case TreeEntry::CopyableVectorize:
case TreeEntry::Vectorize:
if (unsigned Factor = E->getInterleaveFactor()) {
VecLdCost = TTI->getInterleavedMemoryOpCost(
@@ -11794,7 +12183,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
Instruction::isCast(E->getAltOpcode())) ||
- (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
+ (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp())) ||
+ E->State == TreeEntry::CopyableVectorize) &&
"Invalid Shuffle Vector Operand");
// Try to find the previous shuffle node with the same operands and same
// main/alternate ops.
@@ -12550,6 +12940,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
std::optional<unsigned> InsertIdx = getElementIndex(VU);
if (InsertIdx) {
const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
+ if (!ScalarTE && CopyableAltOp.contains(EU.Scalar))
+ continue;
auto *It = find_if(
ShuffledInserts,
[this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
@@ -12632,8 +13024,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
EU.Lane, EU.Scalar, ScalarUserAndIdx);
}
// Leave the scalar instructions as is if they are cheaper than extracts.
- if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
- Entry->getOpcode() == Instruction::Load) {
+ if (Entry &&
+ (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
+ Entry->getOpcode() == Instruction::Load)) {
// Checks if the user of the external scalar is phi in loop body.
auto IsPhiInLoop = [&](const ExternalUser &U) {
if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
@@ -13876,13 +14269,25 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// VL.back() and iterate over schedule data until we reach the end of the
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB) && !E->isGather()) {
- Value *V = E->isOneOf(E->Scalars.back());
+ Value *V = E->getMainOp();
if (doesNotNeedToBeScheduled(V))
V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
- auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
+ auto *Bundle = BlocksSchedules[BB]->getScheduleData(V, E);
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
Res = Bundle->Inst;
+  // We cannot rely on the SLP scheduler for copyable operations, because
+  // there might be inner dependencies that it cannot schedule correctly.
+ if (E->State == TreeEntry::CopyableVectorize) {
+ for (Value *V : E->Scalars) {
+      auto *Inst = dyn_cast<Instruction>(V);
+      if (!Inst)
+        continue;
+ if (Res->comesBefore(Inst))
+ Res = Inst;
+ }
+ }
}
// LastInst can still be null at this point if there's either not an entry
@@ -14632,6 +15037,21 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
unsigned NodeIdx) {
ArrayRef<Value *> VL = E->getOperand(NodeIdx);
InstructionsState S = getSameOpcode(VL, *TLI);
+ if (E->State == TreeEntry::CopyableVectorize) {
+ unsigned Opcode = E->getMainOp()->getOpcode();
+ for (Value *V : VL) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (I->getOpcode() == Opcode) {
+ TreeEntry *VE = getTreeEntry(V);
+ if (!VE)
+ return nullptr;
+ if (VE->State == TreeEntry::CopyableVectorize)
+ return VE;
+ }
+ }
+ }
// Special processing for GEPs bundle, which may include non-gep values.
if (!S && VL.front()->getType()->isPointerTy()) {
const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
@@ -14656,6 +15076,10 @@ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
TreeEntry *VE = getTreeEntry(S.getMainOp());
if (VE && CheckSameVE(VE))
return VE;
+  VE = getTreeEntry(S.getAltOp());
+ if (VE && VE->State == TreeEntry::CopyableVectorize)
+ return VE;
auto It = MultiNodeScalars.find(S.getMainOp());
if (It != MultiNodeScalars.end()) {
auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
@@ -16486,6 +16910,8 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
if (User && !is_contained(Scalar->users(), User))
continue;
TreeEntry *E = getTreeEntry(Scalar);
+ if (!E && CopyableAltOp.contains(Scalar))
+ continue;
assert(E && "Invalid scalar");
assert(!E->isGather() && "Extracting from a gather list");
// Non-instruction pointers are not deleted, just skip them.
@@ -16873,6 +17299,9 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
continue;
if (isa<PoisonValue>(Scalar))
continue;
+ if (Entry->State == TreeEntry::CopyableVectorize &&
+ cast<Instruction>(Scalar)->getOpcode() != Entry->getOpcode())
+ continue;
#ifndef NDEBUG
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
@@ -17114,17 +17543,66 @@ void BoUpSLP::optimizeGatherSequence() {
}
BoUpSLP::ScheduleData *
-BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
+BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S,
+ bool IsCopyable, bool &ReSchedule) {
ScheduleData *Bundle = nullptr;
ScheduleData *PrevInBundle = nullptr;
+ unsigned Opcode = S.getOpcode();
+ ValueList Keys;
+
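+  // Pick a key for every value: the value itself when its ScheduleData is
+  // not yet part of a bundle, otherwise another value from VL (or any
+  // instruction in the block) whose slot for this value is still free, so
+  // copyable elements can get their own ScheduleData.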
for (Value *V : VL) {
+ auto *SD = getScheduleData(V);
+ bool FoundKey = false;
+ if (SD && !SD->isPartOfBundle()) {
+ Keys.push_back(V);
+ continue;
+ }
+ for (Value *Key : VL) {
+ SD = getScheduleData(V, Key);
+ if (SD && SD->isPartOfBundle()) {
+ ReSchedule = true;
+      } else {
+ FoundKey = true;
+ Keys.push_back(Key);
+ break;
+ }
+ }
+ if (!FoundKey) {
+ for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E;
+ ++It) {
+        Value *Key = &*It;
+ SD = getScheduleData(V, Key);
+ if (!SD || !SD->isPartOfBundle()) {
+ FoundKey = true;
+ Keys.push_back(Key);
+ break;
+ }
+ }
+ }
+ }
+
+ for (auto [V, Key] : zip(VL, Keys)) {
if (doesNotNeedToBeScheduled(V))
continue;
- ScheduleData *BundleMember = getScheduleData(V);
+ Instruction *I = dyn_cast<Instruction>(V);
+    bool IsAltInst = I && I->getOpcode() != Opcode;
+
+ ScheduleData *BundleMember = getScheduleData(V, Key);
+ if (V != Key) {
+      ScheduleData *SD = allocateScheduleDataChunks();
+      SD->Inst = I;
+ SD->init(SchedulingRegionID, Key);
+ ExtraScheduleDataMap[I][Key] = SD;
+ BundleMember = getScheduleData(V, Key);
+ }
assert(BundleMember &&
"no ScheduleData for bundle member "
"(maybe not in same basic block)");
- assert(BundleMember->isSchedulingEntity() &&
-         "bundle member already part of other bundle");
+  // A bundle member may already be part of another bundle when elements are
+  // copyable, so we cannot assert isSchedulingEntity() here.
if (PrevInBundle) {
PrevInBundle->NextInBundle = BundleMember;
@@ -17134,6 +17612,8 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
// Group the instructions to a bundle.
BundleMember->FirstInBundle = Bundle;
+ if (IsCopyable && IsAltInst)
+ BundleMember->IsCopy = true;
PrevInBundle = BundleMember;
}
assert(Bundle && "Failed to find schedule bundle");
@@ -17144,7 +17624,9 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
// and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S) {
+ const InstructionsState &S,
+ bool IsCopyable) {
+ bool AnyCopyable = false;
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
if (isa<PHINode>(S.getMainOp()) ||
@@ -17155,8 +17637,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
Instruction *OldScheduleEnd = ScheduleEnd;
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
- auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
- ScheduleData *Bundle) {
+ auto TryScheduleBundleImpl = [this, OldScheduleEnd, &AnyCopyable,
+ SLP](bool ReSchedule, ScheduleData *Bundle) {
// The scheduling region got new instructions at the lower end (or it is a
// new region for the first bundle). This makes it necessary to
// recalculate all dependencies.
@@ -17164,8 +17646,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// initial bundle to the region.
if (ScheduleEnd != OldScheduleEnd) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
- if (ScheduleData *SD = getScheduleData(I))
- SD->clearDependencies();
+ doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
ReSchedule = true;
}
if (Bundle) {
@@ -17186,6 +17667,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
!ReadyInsts.empty()) {
ScheduleData *Picked = ReadyInsts.pop_back_val();
+ if (Picked->TE && Picked->TE->State == TreeEntry::CopyableVectorize)
+ AnyCopyable = true;
assert(Picked->isSchedulingEntity() && Picked->isReady() &&
"must be ready to schedule");
schedule(Picked, ReadyInsts);
@@ -17231,24 +17714,35 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ReSchedule = true;
}
- auto *Bundle = buildBundle(VL);
+ auto *Bundle = buildBundle(VL, S, IsCopyable, ReSchedule);
+ if (!Bundle)
+ return std::nullopt;
TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle->isReady()) {
- cancelScheduling(VL, S.getMainOp());
+ cancelScheduling(VL, Bundle);
+    // If we have any copyable elements, we have to clear all dependencies,
+    // since all values were calculated for the vectorized bundle.
+ if (AnyCopyable) {
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
+ }
+ resetSchedule();
+ }
return std::nullopt;
}
return Bundle;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
- Value *OpValue) {
- if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
- doesNotNeedToSchedule(VL))
+ ScheduleData *Bundle) {
+ if (isa<PHINode>(VL.front()) || isVectorLikeInstWithConstOps(VL.front()) ||
+ doesNotNeedToSchedule(VL) || !Bundle)
return;
- if (doesNotNeedToBeScheduled(OpValue))
- OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
- ScheduleData *Bundle = getScheduleData(OpValue);
+ if (Bundle->FirstInBundle)
+ Bundle = Bundle->FirstInBundle;
+
LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
@@ -17271,6 +17765,13 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
if (BundleMember->unscheduledDepsInBundle() == 0) {
ReadyInsts.insert(BundleMember);
}
+      auto I = ExtraScheduleDataMap.find(BundleMember->Inst);
+      if (I != ExtraScheduleDataMap.end()) {
+        // Collect the keys first; erasing while iterating the map would
+        // invalidate the iterator.
+        SmallVector<Value *, 4> Keys;
+        for (auto &SD : I->second)
+          if (SD.second == BundleMember)
+            Keys.push_back(SD.first);
+        for (Value *Key : Keys)
+          I->second.erase(Key);
+      }
BundleMember = Next;
}
}
@@ -17286,19 +17787,34 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
Value *V, const InstructionsState &S) {
+ if (getScheduleData(V, S.getMainOp()))
+ return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
!doesNotNeedToBeScheduled(I) &&
"phi nodes/insertelements/extractelements/extractvalues don't need to "
"be scheduled");
- if (getScheduleData(I))
+ auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
+ ScheduleData *ISD = getScheduleData(I);
+ if (!ISD)
+ return false;
+ assert(isInSchedulingRegion(ISD) &&
+ "ScheduleData not in scheduling region");
+ ScheduleData *SD = allocateScheduleDataChunks();
+ SD->Inst = I;
+ SD->init(SchedulingRegionID, S.getMainOp());
+ return true;
+ };
+ if (CheckScheduleForI(I))
return true;
if (!ScheduleStart) {
// It's the first instruction in the new region.
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
+ if (isOneOf(S, I) != I)
+ CheckScheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
@@ -17337,6 +17853,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
"Instruction is in wrong basic block.");
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
+ if (isOneOf(S, I) != I)
+ CheckScheduleForI(I);
LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
<< "\n");
return true;
@@ -17349,6 +17867,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
nullptr);
ScheduleEnd = I->getNextNode();
+ if (isOneOf(S, I) != I)
+ CheckScheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
return true;
@@ -17367,6 +17887,7 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
if (!SD) {
SD = allocateScheduleDataChunks();
ScheduleDataMap[I] = SD;
+ SD->Inst = I;
}
assert(!isInSchedulingRegion(SD) &&
"new ScheduleData already in scheduling region");
@@ -17421,26 +17942,32 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
// Handle def-use chain dependencies.
for (User *U : BundleMember->Inst->users()) {
- if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ doForAllOpcodes(I, [&](ScheduleData *UseSD) {
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if ((UseSD->IsCopy || BundleMember->IsCopy) &&
+ DestBundle == BundleMember->FirstInBundle)
+ return;
+ BundleMember->Dependencies++;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ });
+ }
+ }
+
+ auto MakeControlDependent = [&](Instruction *I) {
+ doForAllOpcodes(I, [&](ScheduleData *DepDest) {
+ assert(DepDest && "must be in schedule window");
+ DepDest->ControlDependencies.push_back(BundleMember);
BundleMember->Dependencies++;
- ScheduleData *DestBundle = UseSD->FirstInBundle;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
if (!DestBundle->IsScheduled)
BundleMember->incrementUnscheduledDeps(1);
if (!DestBundle->hasValidDependencies())
WorkList.push_back(DestBundle);
- }
- }
-
- auto MakeControlDependent = [&](Instruction *I) {
- auto *DepDest = getScheduleData(I);
- assert(DepDest && "must be in schedule window");
- DepDest->ControlDependencies.push_back(BundleMember);
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = DepDest->FirstInBundle;
- if (!DestBundle->IsScheduled)
- BundleMember->incrementUnscheduledDeps(1);
- if (!DestBundle->hasValidDependencies())
- WorkList.push_back(DestBundle);
+ });
};
// Any instruction which isn't safe to speculate at the beginning of the
@@ -17576,12 +18103,12 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
assert(ScheduleStart &&
"tried to reset schedule on block which has not been scheduled");
for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- if (ScheduleData *SD = getScheduleData(I)) {
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
assert(isInSchedulingRegion(SD) &&
"ScheduleData not in scheduling region");
SD->IsScheduled = false;
SD->resetUnscheduledDeps();
- }
+ });
}
ReadyInsts.clear();
}
@@ -17616,8 +18143,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
int Idx = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
- if (ScheduleData *SD = BS->getScheduleData(I)) {
- [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
+ BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
+ [[maybe_unused]] TreeEntry *SDTE = SD->TE;
assert((isVectorLikeInstWithConstOps(SD->Inst) ||
SD->isPartOfBundle() ==
(SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
@@ -17626,7 +18153,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
if (SD->isSchedulingEntity() && SD->isPartOfBundle())
BS->calculateDependencies(SD, false, this);
- }
+ });
}
BS->initialFillReadyList(ReadyInsts);
@@ -17642,7 +18169,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
for (ScheduleData *BundleMember = Picked; BundleMember;
BundleMember = BundleMember->NextInBundle) {
Instruction *PickedInst = BundleMember->Inst;
- if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
+ if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst &&
+ LastScheduledInst->getPrevNode())
PickedInst->moveAfter(LastScheduledInst->getPrevNode());
LastScheduledInst = PickedInst;
}
@@ -17658,9 +18186,11 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
// Check that all schedulable entities got scheduled
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
- ScheduleData *SD = BS->getScheduleData(I);
- if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
- assert(SD->IsScheduled && "must be scheduled at this point");
+ BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
+ if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
+ assert(SD->IsScheduled && "must be scheduled at this point");
+ }
+ });
}
#endif
@@ -17771,6 +18301,9 @@ bool BoUpSLP::collectValuesToDemote(
if (NodesToKeepBWs.contains(E.Idx))
return false;
+ if (E.State == TreeEntry::CopyableVectorize)
+ return false;
+
// If the value is not a vectorized instruction in the expression and not used
// by the insertelement instruction and not used in multiple vector nodes, it
// cannot be demoted.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 869a9d1aee80e3..4bbff7b513859a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -slp-vectorize-copyable=true -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,COPYABLE %s
define void @add0(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @add0(
@@ -60,6 +61,13 @@ define void @add1(ptr noalias %dst, ptr noalias %src) {
; POW2-ONLY-NEXT: store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4
; POW2-ONLY-NEXT: ret void
;
+; COPYABLE-LABEL: @add1(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
+; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: ret void
+;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
%0 = load i32, ptr %src, align 4
@@ -82,21 +90,44 @@ entry:
}
define void @sub0(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @sub0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @sub0(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; NON-POW2-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; NON-POW2-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @sub0(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @sub0(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 -1, i32 0, i32 -2, i32 -3>
+; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -180,23 +211,55 @@ entry:
}
define void @addsub0(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
-; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @addsub0(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; NON-POW2-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; NON-POW2-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @addsub0(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store i32 [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @addsub0(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 0>
+; COPYABLE-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; COPYABLE-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; COPYABLE-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -220,23 +283,55 @@ entry:
}
define void @addsub1(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
-; CHECK-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @addsub1(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; NON-POW2-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; NON-POW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; NON-POW2-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
+; NON-POW2-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @addsub1(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
+; POW2-ONLY-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @addsub1(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; COPYABLE-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
+; COPYABLE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; COPYABLE-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP4]], <i32 0, i32 -3>
+; COPYABLE-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -260,21 +355,44 @@ entry:
}
define void @mul(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @mul(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT: store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
-; CHECK-NEXT: store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @mul(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
+; NON-POW2-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; NON-POW2-NEXT: [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; NON-POW2-NEXT: store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; NON-POW2-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; NON-POW2-NEXT: store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @mul(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; POW2-ONLY-NEXT: store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @mul(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[TMP0]], <i32 257, i32 -3, i32 1, i32 -9>
+; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -325,6 +443,13 @@ define void @shl0(ptr noalias %dst, ptr noalias %src) {
; POW2-ONLY-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
; POW2-ONLY-NEXT: ret void
;
+; COPYABLE-LABEL: @shl0(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; COPYABLE-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
+; COPYABLE-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; COPYABLE-NEXT: ret void
+;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
%0 = load i32, ptr %src, align 4
@@ -434,6 +559,22 @@ define void @add1f(ptr noalias %dst, ptr noalias %src) {
; POW2-ONLY-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
; POW2-ONLY-NEXT: ret void
;
+; COPYABLE-LABEL: @add1f(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; COPYABLE-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; COPYABLE-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; COPYABLE-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; COPYABLE-NEXT: ret void
+;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
%0 = load float, ptr %src, align 4
@@ -554,23 +695,62 @@ entry:
}
define void @addsub0f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub0f(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @addsub0f(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store float [[SUB]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; NON-POW2-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; NON-POW2-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @addsub0f(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store float [[SUB]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @addsub0f(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; COPYABLE-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT: store float [[SUB]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; COPYABLE-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; COPYABLE-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
+; COPYABLE-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT: store float [[SUB5]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; COPYABLE-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; COPYABLE-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -594,23 +774,62 @@ entry:
}
define void @addsub1f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @addsub1f(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
-; CHECK-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @addsub1f(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; NON-POW2-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; NON-POW2-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; NON-POW2-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; NON-POW2-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; NON-POW2-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; NON-POW2-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
+; NON-POW2-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @addsub1f(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; POW2-ONLY-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], splat (float -1.000000e+00)
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
+; POW2-ONLY-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @addsub1f(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; COPYABLE-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT: store float [[SUB]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; COPYABLE-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
+; COPYABLE-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; COPYABLE-NEXT: store float [[SUB1]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR3]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; COPYABLE-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; COPYABLE-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
+; COPYABLE-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -729,6 +948,22 @@ define void @add1fn(ptr noalias %dst, ptr noalias %src) {
; POW2-ONLY-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
; POW2-ONLY-NEXT: ret void
;
+; COPYABLE-LABEL: @add1fn(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; COPYABLE-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; COPYABLE-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; COPYABLE-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; COPYABLE-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; COPYABLE-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; COPYABLE-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; COPYABLE-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; COPYABLE-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; COPYABLE-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; COPYABLE-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; COPYABLE-NEXT: ret void
+;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
%0 = load float, ptr %src, align 4
@@ -885,3 +1120,106 @@ entry:
store float %sub9, ptr %incdec.ptr7, align 4
ret void
}
+
+define void @and_lshr(ptr %0, ptr %1, float %2, float %3) {
+; NON-POW2-LABEL: @and_lshr(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; NON-POW2-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; NON-POW2-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP5]], 2
+; NON-POW2-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP5]], 4
+; NON-POW2-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; NON-POW2-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP6]], i32 1
+; NON-POW2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP7]], i32 2
+; NON-POW2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+; NON-POW2-NEXT: [[TMP12:%.*]] = and <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; NON-POW2-NEXT: [[TMP13:%.*]] = lshr <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; NON-POW2-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; NON-POW2-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[TMP14]] to <4 x float>
+; NON-POW2-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; NON-POW2-NEXT: [[TMP17:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP18:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> poison, <4 x i32> zeroinitializer
+; NON-POW2-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP16]], [[TMP18]]
+; NON-POW2-NEXT: [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <4 x i32> zeroinitializer
+; NON-POW2-NEXT: [[TMP22:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP21]], <4 x float> [[TMP15]], <4 x float> [[TMP19]])
+; NON-POW2-NEXT: store <4 x float> [[TMP22]], ptr [[TMP0]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @and_lshr(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP5]], 2
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP5]], 4
+; POW2-ONLY-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP6]], i32 1
+; POW2-ONLY-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP7]], i32 2
+; POW2-ONLY-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+; POW2-ONLY-NEXT: [[TMP12:%.*]] = and <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; POW2-ONLY-NEXT: [[TMP13:%.*]] = lshr <4 x i32> [[TMP11]], <i32 3, i32 3, i32 3, i32 6>
+; POW2-ONLY-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; POW2-ONLY-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[TMP14]] to <4 x float>
+; POW2-ONLY-NEXT: [[TMP16:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; POW2-ONLY-NEXT: [[TMP17:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; POW2-ONLY-NEXT: [[TMP18:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> poison, <4 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP16]], [[TMP18]]
+; POW2-ONLY-NEXT: [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; POW2-ONLY-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <4 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP22:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP21]], <4 x float> [[TMP15]], <4 x float> [[TMP19]])
+; POW2-ONLY-NEXT: store <4 x float> [[TMP22]], ptr [[TMP0]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; COPYABLE-LABEL: @and_lshr(
+; COPYABLE-NEXT: entry:
+; COPYABLE-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP1:%.*]], align 1
+; COPYABLE-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; COPYABLE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; COPYABLE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 0, i32 2, i32 4, i32 6>
+; COPYABLE-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], <i32 3, i32 3, i32 3, i32 -1>
+; COPYABLE-NEXT: [[TMP10:%.*]] = sitofp <4 x i32> [[TMP9]] to <4 x float>
+; COPYABLE-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 4
+; COPYABLE-NEXT: [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP3:%.*]], i32 0
+; COPYABLE-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT: [[TMP14:%.*]] = fadd <4 x float> [[TMP11]], [[TMP13]]
+; COPYABLE-NEXT: [[TMP15:%.*]] = insertelement <4 x float> poison, float [[TMP2:%.*]], i32 0
+; COPYABLE-NEXT: [[TMP16:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> poison, <4 x i32> zeroinitializer
+; COPYABLE-NEXT: [[TMP17:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP16]], <4 x float> [[TMP10]], <4 x float> [[TMP14]])
+; COPYABLE-NEXT: store <4 x float> [[TMP17]], ptr [[TMP0]], align 4
+; COPYABLE-NEXT: ret void
+;
+entry:
+ %5 = getelementptr inbounds float, ptr %0, i64 1
+ %6 = getelementptr inbounds float, ptr %0, i64 2
+ %7 = getelementptr inbounds float, ptr %0, i64 3
+ %8 = load i8, ptr %1, align 1
+ %9 = zext i8 %8 to i32
+ %10 = and i32 %9, 3
+ %11 = sitofp i32 %10 to float
+ %12 = lshr i32 %9, 2
+ %13 = and i32 %12, 3
+ %14 = sitofp i32 %13 to float
+ %15 = lshr i32 %9, 4
+ %16 = and i32 %15, 3
+ %17 = sitofp i32 %16 to float
+ %18 = lshr i32 %9, 6
+ %19 = sitofp i32 %18 to float
+ %20 = load float, ptr %0, align 4
+ %21 = fadd float %20, %3
+ %22 = tail call noundef float @llvm.fma.f32(float %2, float %11, float %21)
+ store float %22, ptr %0, align 4
+ %23 = load float, ptr %5, align 4
+ %24 = fadd float %23, %3
+ %25 = tail call noundef float @llvm.fma.f32(float %2, float %14, float %24)
+ store float %25, ptr %5, align 4
+ %26 = load float, ptr %6, align 4
+ %27 = fadd float %26, %3
+ %28 = tail call noundef float @llvm.fma.f32(float %2, float %17, float %27)
+ store float %28, ptr %6, align 4
+ %29 = load float, ptr %7, align 4
+ %30 = fadd float %29, %3
+ %31 = tail call noundef float @llvm.fma.f32(float %2, float %19, float %30)
+ store float %31, ptr %7, align 4
+ ret void
+}
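
For readers skimming the COPYABLE checks above: they all follow one scheme. A lane whose scalar has no matching main opcode (for instance a plain load/store copy) is modeled as the main opcode with a neutral operand (0 for add/sub/shl/lshr, 1 for mul, -1 for and), so the whole bundle still folds into a single vector instruction. A minimal sketch of the input shape this targets, with assumed names and not a test taken from this patch:

define void @copy_lane(ptr noalias %dst, ptr noalias %src) {
entry:
  %0 = load i32, ptr %src, align 4
  %add = add nsw i32 %0, -1            ; lane 0: a real add
  store i32 %add, ptr %dst, align 4
  %src1 = getelementptr inbounds i32, ptr %src, i64 1
  %dst1 = getelementptr inbounds i32, ptr %dst, i64 1
  %1 = load i32, ptr %src1, align 4
  store i32 %1, ptr %dst1, align 4     ; lane 1: plain copy, no add
  ret void
}

; With -mllvm -slp-vectorize-copyable=true the copy lane can be treated as
; 'add nsw i32 %1, 0', so both lanes vectorize into one operation, roughly:
;   %2 = load <2 x i32>, ptr %src, align 4
;   %3 = add nsw <2 x i32> %2, <i32 -1, i32 0>
;   store <2 x i32> %3, ptr %dst, align 4

This is the same shape the @sub0 and @mul COPYABLE checks verify at width 4 (identity lanes <i32 0> and <i32 1> respectively), and @and_lshr shows it composing across two opcodes (lshr by 0 and and with -1).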