[llvm] 8835719 - Revert "[SLP]Improve reductions analysis and emission, part 1."
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 19 06:18:02 PDT 2022
Author: Alexey Bataev
Date: 2022-04-19T06:17:03-07:00
New Revision: 883571928c3416bb46e0cb63344915fccd9e6192
URL: https://github.com/llvm/llvm-project/commit/883571928c3416bb46e0cb63344915fccd9e6192
DIFF: https://github.com/llvm/llvm-project/commit/883571928c3416bb46e0cb63344915fccd9e6192.diff
LOG: Revert "[SLP]Improve reductions analysis and emission, part 1."
This reverts commit 0e1f4d4d3cb08ff84df5adc4f5e41d0a2cebc53d to fix
a crash reported in PR54976
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9d909de34eb6e..4dc964d1c9d0b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3986,83 +3986,6 @@ static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
}
#endif
-/// Generates key/subkey pair for the given value to provide effective sorting
-/// of the values and better detection of the vectorizable values sequences. The
-/// keys/subkeys can be used for better sorting of the values themselves (keys)
-/// and in values subgroups (subkeys).
-static std::pair<size_t, size_t> generateKeySubkey(
- Value *V, const TargetLibraryInfo *TLI,
- function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
- bool AllowAlternate) {
- hash_code Key = hash_value(V->getValueID() + 2);
- hash_code SubKey = hash_value(0);
- // Sort the loads by the distance between the pointers.
- if (auto *LI = dyn_cast<LoadInst>(V)) {
- Key = hash_combine(hash_value(LI->getParent()), Key);
- if (LI->isSimple())
- SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
- else
- SubKey = hash_value(LI);
- } else if (isVectorLikeInstWithConstOps(V)) {
- // Sort extracts by the vector operands.
- if (isa<ExtractElementInst, UndefValue>(V))
- Key = hash_value(Value::UndefValueVal + 1);
- if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
- if (!isUndefVector(EI->getVectorOperand()) &&
- !isa<UndefValue>(EI->getIndexOperand()))
- SubKey = hash_value(EI->getVectorOperand());
- }
- } else if (auto *I = dyn_cast<Instruction>(V)) {
- // Sort other instructions just by the opcodes except for CMPInst.
- // For CMP also sort by the predicate kind.
- if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
- isValidForAlternation(I->getOpcode())) {
- if (AllowAlternate)
- Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
- else
- Key = hash_combine(hash_value(I->getOpcode()), Key);
- SubKey = hash_combine(
- hash_value(I->getOpcode()), hash_value(I->getType()),
- hash_value(isa<BinaryOperator>(I)
- ? I->getType()
- : cast<CastInst>(I)->getOperand(0)->getType()));
- } else if (auto *CI = dyn_cast<CmpInst>(I)) {
- CmpInst::Predicate Pred = CI->getPredicate();
- if (CI->isCommutative())
- Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
- CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
- SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
- hash_value(SwapPred),
- hash_value(CI->getOperand(0)->getType()));
- } else if (auto *Call = dyn_cast<CallInst>(I)) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
- if (isTriviallyVectorizable(ID))
- SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
- else if (!VFDatabase(*Call).getMappings(*Call).empty())
- SubKey = hash_combine(hash_value(I->getOpcode()),
- hash_value(Call->getCalledFunction()));
- else
- SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
- for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
- SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
- hash_value(Op.Tag), SubKey);
- } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
- if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
- SubKey = hash_value(Gep->getPointerOperand());
- else
- SubKey = hash_value(Gep);
- } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
- !isa<ConstantInt>(I->getOperand(1))) {
- // Do not try to vectorize instructions with potentially high cost.
- SubKey = hash_value(I);
- } else {
- SubKey = hash_value(I->getOpcode());
- }
- Key = hash_combine(hash_value(I->getParent()), Key);
- }
- return std::make_pair(Key, SubKey);
-}
-
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -6470,12 +6393,6 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
}
}
- if (UsedTEs.empty()) {
- assert(all_of(TE->Scalars, UndefValue::classof) &&
- "Expected vector of undefs only.");
- return None;
- }
-
unsigned VF = 0;
if (UsedTEs.size() == 1) {
// Try to find the perfect match in another gather node at first.
@@ -9281,16 +9198,15 @@ class HorizontalReduction {
using ReductionOpsType = SmallVector<Value *, 16>;
using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
ReductionOpsListType ReductionOps;
- /// List of possibly reduced values.
- SmallVector<SmallVector<Value *>> ReducedVals;
- /// Maps reduced value to the corresponding reduction operation.
- DenseMap<Value *, Instruction *> ReducedValsToOps;
+ SmallVector<Value *, 32> ReducedVals;
// Use map vector to make stable output.
MapVector<Instruction *, Value *> ExtraArgs;
WeakTrackingVH ReductionRoot;
/// The type of reduction operation.
RecurKind RdxKind;
+ const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max();
+
static bool isCmpSelMinMax(Instruction *I) {
return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
@@ -9334,6 +9250,26 @@ class HorizontalReduction {
return I->getOperand(Index);
}
+ /// Checks if the ParentStackElem.first should be marked as a reduction
+ /// operation with an extra argument or as extra argument itself.
+ void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
+ Value *ExtraArg) {
+ if (ExtraArgs.count(ParentStackElem.first)) {
+ ExtraArgs[ParentStackElem.first] = nullptr;
+ // We ran into something like:
+ // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
+ // The whole ParentStackElem.first should be considered as an extra value
+ // in this case.
+ // Do not perform analysis of remaining operands of ParentStackElem.first
+ // instruction, this whole instruction is an extra argument.
+ ParentStackElem.second = INVALID_OPERAND_INDEX;
+ } else {
+ // We ran into something like:
+ // ParentStackElem.first += ... + ExtraArg + ...
+ ExtraArgs[ParentStackElem.first] = ExtraArg;
+ }
+ }
+
/// Creates reduction operation with the current opcode.
static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name, bool UseSelect) {
@@ -9418,7 +9354,7 @@ class HorizontalReduction {
/// Creates reduction operation with the current opcode with the IR flags
/// from \p I.
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
- Value *RHS, const Twine &Name, Value *I) {
+ Value *RHS, const Twine &Name, Instruction *I) {
auto *SelI = dyn_cast<SelectInst>(I);
Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr);
if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
@@ -9429,10 +9365,8 @@ class HorizontalReduction {
return Op;
}
- static RecurKind getRdxKind(Value *V) {
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return RecurKind::None;
+ static RecurKind getRdxKind(Instruction *I) {
+ assert(I && "Expected instruction for reduction matching");
if (match(I, m_Add(m_Value(), m_Value())))
return RecurKind::Add;
if (match(I, m_Mul(m_Value(), m_Value())))
@@ -9594,9 +9528,7 @@ class HorizontalReduction {
HorizontalReduction() = default;
/// Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst,
- ScalarEvolution &SE, const DataLayout &DL,
- const TargetLibraryInfo &TLI) {
+ bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) {
assert((!Phi || is_contained(Phi->operands(), Inst)) &&
"Phi needs to use the binary operator");
assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||
@@ -9640,121 +9572,88 @@ class HorizontalReduction {
ReductionRoot = Inst;
- // Iterate through all the operands of the possible reduction tree and
- // gather all the reduced values, sorting them by their value id.
- BasicBlock *BB = Inst->getParent();
- bool IsCmpSelMinMax = isCmpSelMinMax(Inst);
- SmallVector<Instruction *> Worklist(1, Inst);
- // Checks if the operands of the \p TreeN instruction are also reduction
- // operations or should be treated as reduced values or an extra argument,
- // which is not part of the reduction.
- auto &&CheckOperands = [this, IsCmpSelMinMax,
- BB](Instruction *TreeN,
- SmallVectorImpl<Value *> &ExtraArgs,
- SmallVectorImpl<Value *> &PossibleReducedVals,
- SmallVectorImpl<Instruction *> &ReductionOps) {
- for (int I = getFirstOperandIndex(TreeN),
- End = getNumberOfOperands(TreeN);
- I < End; ++I) {
- Value *EdgeVal = getRdxOperand(TreeN, I);
- ReducedValsToOps.try_emplace(EdgeVal, TreeN);
- auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
- // Edge has wrong parent - mark as an extra argument.
- if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
- !hasSameParent(EdgeInst, BB)) {
- ExtraArgs.push_back(EdgeVal);
- continue;
- }
- // If the edge is not an instruction, or it is
diff erent from the main
- // reduction opcode or has too many uses - possible reduced value.
- if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
- !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
- !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {
- PossibleReducedVals.push_back(EdgeVal);
- continue;
- }
- ReductionOps.push_back(EdgeInst);
- }
- };
- // Try to regroup reduced values so that it gets more profitable to try to
- // reduce them. Values are grouped by their value ids, instructions - by
- // instruction op id and/or alternate op id, plus do extra analysis for
- // loads (grouping them by the distabce between pointers) and cmp
- // instructions (grouping them by the predicate).
- MapVector<size_t, MapVector<size_t, SmallVector<Value *>>>
- PossibleReducedVals;
+ // The opcode for leaf values that we perform a reduction on.
+ // For example: load(x) + load(y) + load(z) + fptoui(w)
+ // The leaf opcode for 'w' does not match, so we don't include it as a
+ // potential candidate for the reduction.
+ unsigned LeafOpcode = 0;
+
+ // Post-order traverse the reduction tree starting at Inst. We only handle
+ // true trees containing binary operators or selects.
+ SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
+ Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst)));
initReductionOps(Inst);
- while (!Worklist.empty()) {
- Instruction *TreeN = Worklist.pop_back_val();
- SmallVector<Value *> Args;
- SmallVector<Value *> PossibleRedVals;
- SmallVector<Instruction *> PossibleReductionOps;
- CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
- // If too many extra args - mark the instruction itself as a reduction
- // value, not a reduction operation.
- if (Args.size() < 2) {
- addReductionOps(TreeN);
- // Add extra args.
- if (!Args.empty()) {
- assert(Args.size() == 1 && "Expected only single argument.");
- ExtraArgs[TreeN] = Args.front();
+ while (!Stack.empty()) {
+ Instruction *TreeN = Stack.back().first;
+ unsigned EdgeToVisit = Stack.back().second++;
+ const RecurKind TreeRdxKind = getRdxKind(TreeN);
+ bool IsReducedValue = TreeRdxKind != RdxKind;
+
+ // Postorder visit.
+ if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) {
+ if (IsReducedValue)
+ ReducedVals.push_back(TreeN);
+ else {
+ auto ExtraArgsIter = ExtraArgs.find(TreeN);
+ if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) {
+ // Check if TreeN is an extra argument of its parent operation.
+ if (Stack.size() <= 1) {
+ // TreeN can't be an extra argument as it is a root reduction
+ // operation.
+ return false;
+ }
+ // Yes, TreeN is an extra argument, do not add it to a list of
+ // reduction operations.
+ // Stack[Stack.size() - 2] always points to the parent operation.
+ markExtraArg(Stack[Stack.size() - 2], TreeN);
+ ExtraArgs.erase(TreeN);
+ } else
+ addReductionOps(TreeN);
}
- // Add reduction values. The values are sorted for better vectorization
- // results.
- for (Value *V : PossibleRedVals) {
- size_t Key, Idx;
- std::tie(Key, Idx) = generateKeySubkey(
- V, &TLI,
- [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
- for (const auto &LoadData : PossibleReducedVals[Key]) {
- auto *RLI = cast<LoadInst>(LoadData.second.front());
- if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
- LI->getType(), LI->getPointerOperand(),
- DL, SE, /*StrictCheck=*/true))
- return hash_value(RLI->getPointerOperand());
- }
- return hash_value(LI->getPointerOperand());
- },
- /*AllowAlternate=*/false);
- PossibleReducedVals[Key][Idx].push_back(V);
+ // Retract.
+ Stack.pop_back();
+ continue;
+ }
+
+ // Visit operands.
+ Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit);
+ auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+ if (!EdgeInst) {
+ // Edge value is not a reduction instruction or a leaf instruction.
+ // (It may be a constant, function argument, or something else.)
+ markExtraArg(Stack.back(), EdgeVal);
+ continue;
+ }
+ RecurKind EdgeRdxKind = getRdxKind(EdgeInst);
+ // Continue analysis if the next operand is a reduction operation or
+ // (possibly) a leaf value. If the leaf value opcode is not set,
+ // the first met operation != reduction operation is considered as the
+ // leaf opcode.
+ // Only handle trees in the current basic block.
+ // Each tree node needs to have minimal number of users except for the
+ // ultimate reduction.
+ const bool IsRdxInst = EdgeRdxKind == RdxKind;
+ if (EdgeInst != Phi && EdgeInst != Inst &&
+ hasSameParent(EdgeInst, Inst->getParent()) &&
+ hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) &&
+ (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) {
+ if (IsRdxInst) {
+ // We need to be able to reassociate the reduction operations.
+ if (!isVectorizable(EdgeRdxKind, EdgeInst)) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), EdgeInst);
+ continue;
+ }
+ } else if (!LeafOpcode) {
+ LeafOpcode = EdgeInst->getOpcode();
}
- Worklist.append(PossibleReductionOps.begin(),
- PossibleReductionOps.end());
- } else {
- size_t Key, Idx;
- std::tie(Key, Idx) = generateKeySubkey(
- TreeN, &TLI,
- [&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {
- for (const auto &LoadData : PossibleReducedVals[Key]) {
- auto *RLI = cast<LoadInst>(LoadData.second.front());
- if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
- LI->getType(), LI->getPointerOperand(), DL,
- SE, /*StrictCheck=*/true))
- return hash_value(RLI->getPointerOperand());
- }
- return hash_value(LI->getPointerOperand());
- },
- /*AllowAlternate=*/false);
- PossibleReducedVals[Key][Idx].push_back(TreeN);
- }
- }
- auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
- // Sort values by the total number of values kinds to start the reduction
- // from the longest possible reduced values sequences.
- for (auto &PossibleReducedVals : PossibleReducedValsVect) {
- auto PossibleRedVals = PossibleReducedVals.second.takeVector();
- stable_sort(PossibleRedVals, [](const auto &P1, const auto &P2) {
- return P1.second.size() > P2.second.size();
- });
- ReducedVals.emplace_back();
- for (auto &Data : PossibleRedVals)
- ReducedVals.back().append(Data.second.rbegin(), Data.second.rend());
- }
- // Sort the reduced values by number of same/alternate opcode and/or pointer
- // operand.
- stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
- return P1.size() > P2.size();
- });
+ Stack.push_back(
+ std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst)));
+ continue;
+ }
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), EdgeInst);
+ }
return true;
}
@@ -9763,29 +9662,34 @@ class HorizontalReduction {
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
- unsigned NumReducedVals = std::accumulate(
- ReducedVals.begin(), ReducedVals.end(), 0,
- [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); });
+ unsigned NumReducedVals = ReducedVals.size();
if (NumReducedVals < 4)
return nullptr;
+ // Intersect the fast-math-flags from all reduction operations.
+ FastMathFlags RdxFMF;
+ RdxFMF.set();
+ for (ReductionOpsType &RdxOp : ReductionOps) {
+ for (Value *RdxVal : RdxOp) {
+ if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal))
+ RdxFMF &= FPMO->getFastMathFlags();
+ }
+ }
+
IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
+ Builder.setFastMathFlags(RdxFMF);
- // Track the reduced values in case if they are replaced by extractelement
- // because of the vectorization.
- DenseMap<Value *, WeakTrackingVH> TrackedVals;
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
// The same extra argument may be used several times, so log each attempt
// to use it.
for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
assert(Pair.first && "DebugLoc must be set.");
ExternallyUsedValues[Pair.second].push_back(Pair.first);
- TrackedVals.try_emplace(Pair.second, Pair.second);
}
// The compare instruction of a min/max is the insertion point for new
// instructions and may be replaced with a new compare instruction.
- auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
+ auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
assert(isa<SelectInst>(RdxRootInst) &&
"Expected min/max reduction to have select root instruction");
Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
@@ -9797,246 +9701,141 @@ class HorizontalReduction {
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
- SmallVector<Value *> IgnoreList;
- for (ReductionOpsType &RdxOps : ReductionOps)
- for (Value *RdxOp : RdxOps) {
- if (!RdxOp)
- continue;
- IgnoreList.push_back(RdxOp);
- }
-
- // Need to track reduced vals, they may be changed during vectorization of
- // subvectors.
- for (ArrayRef<Value *> Candidates : ReducedVals)
- for (Value *V : Candidates)
- TrackedVals.try_emplace(V, V);
-
- DenseMap<Value *, unsigned> VectorizedVals;
- Value *VectorizedTree = nullptr;
- // Try to vectorize elements based on their type.
- for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
- ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
- InstructionsState S = getSameOpcode(OrigReducedVals);
- SmallVector<Value *> Candidates;
- DenseMap<Value *, Value *> TrackedToOrig;
- for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
- Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
- // Check if the reduction value was not overriden by the extractelement
- // instruction because of the vectorization and exclude it, if it is not
- // compatible with other values.
- if (auto *Inst = dyn_cast<Instruction>(RdxVal))
- if (isVectorLikeInstWithConstOps(Inst) &&
- (!S.getOpcode() || !S.isOpcodeOrAlt(Inst)))
- continue;
- Candidates.push_back(RdxVal);
- TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
- }
- bool ShuffledExtracts = false;
- // Try to handle shuffled extractelements.
- if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
- I + 1 < E) {
- InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]);
- if (NextS.getOpcode() == Instruction::ExtractElement &&
- !NextS.isAltShuffle()) {
- SmallVector<Value *> CommonCandidates(Candidates);
- for (Value *RV : ReducedVals[I + 1]) {
- Value *RdxVal = TrackedVals.find(RV)->second;
- // Check if the reduction value was not overriden by the
- // extractelement instruction because of the vectorization and
- // exclude it, if it is not compatible with other values.
- if (auto *Inst = dyn_cast<Instruction>(RdxVal))
- if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
- continue;
- CommonCandidates.push_back(RdxVal);
- TrackedToOrig.try_emplace(RdxVal, RV);
- }
- SmallVector<int> Mask;
- if (isFixedVectorShuffle(CommonCandidates, Mask)) {
- ++I;
- Candidates.swap(CommonCandidates);
- ShuffledExtracts = true;
- }
+ SmallVector<Value *, 16> IgnoreList;
+ for (ReductionOpsType &RdxOp : ReductionOps)
+ IgnoreList.append(RdxOp.begin(), RdxOp.end());
+
+ unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+ if (NumReducedVals > ReduxWidth) {
+ // In the loop below, we are building a tree based on a window of
+ // 'ReduxWidth' values.
+ // If the operands of those values have common traits (compare predicate,
+ // constant operand, etc), then we want to group those together to
+ // minimize the cost of the reduction.
+
+ // TODO: This should be extended to count common operands for
+ // compares and binops.
+
+ // Step 1: Count the number of times each compare predicate occurs.
+ SmallDenseMap<unsigned, unsigned> PredCountMap;
+ for (Value *RdxVal : ReducedVals) {
+ CmpInst::Predicate Pred;
+ if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value())))
+ ++PredCountMap[Pred];
+ }
+ // Step 2: Sort the values so the most common predicates come first.
+ stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) {
+ CmpInst::Predicate PredA, PredB;
+ if (match(A, m_Cmp(PredA, m_Value(), m_Value())) &&
+ match(B, m_Cmp(PredB, m_Value(), m_Value()))) {
+ return PredCountMap[PredA] > PredCountMap[PredB];
}
- }
- unsigned NumReducedVals = Candidates.size();
- if (NumReducedVals < 4)
- continue;
-
- unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
- unsigned Start = 0;
- unsigned Pos = Start;
- // Restarts vectorization attempt with lower vector factor.
- auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals]() {
- if (ReduxWidth == 4 || Pos >= NumReducedVals - ReduxWidth + 1) {
- ++Start;
- ReduxWidth = PowerOf2Floor(NumReducedVals - Start) * 2;
- }
- Pos = Start;
- ReduxWidth /= 2;
- };
- while (Pos < NumReducedVals - ReduxWidth + 1 && ReduxWidth >= 4) {
- ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
- V.buildTree(VL, IgnoreList);
- if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
- AdjustReducedVals();
- continue;
- }
- if (V.isLoadCombineReductionCandidate(RdxKind)) {
- AdjustReducedVals();
- continue;
- }
- V.reorderTopToBottom();
- // No need to reorder the root node at all.
- V.reorderBottomToTop(/*IgnoreReorder=*/true);
- // Keep extracted other reduction values, if they are used in the
- // vectorization trees.
- BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
- ExternallyUsedValues);
- for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
- if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
- continue;
- for_each(ReducedVals[Cnt],
- [&LocalExternallyUsedValues, &TrackedVals](Value *V) {
- if (isa<Instruction>(V))
- LocalExternallyUsedValues[TrackedVals[V]];
- });
- }
- for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
- if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
- continue;
- if (VectorizedVals.count(Candidates[Cnt]))
- continue;
- LocalExternallyUsedValues[Candidates[Cnt]];
- }
- V.buildExternalUses(LocalExternallyUsedValues);
+ return false;
+ });
+ }
- V.computeMinimumValueSizes();
+ Value *VectorizedTree = nullptr;
+ unsigned i = 0;
+ while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
+ ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
+ V.buildTree(VL, IgnoreList);
+ if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true))
+ break;
+ if (V.isLoadCombineReductionCandidate(RdxKind))
+ break;
+ V.reorderTopToBottom();
+ V.reorderBottomToTop(/*IgnoreReorder=*/true);
+ V.buildExternalUses(ExternallyUsedValues);
+
+ // For a poison-safe boolean logic reduction, do not replace select
+ // instructions with logic ops. All reduced values will be frozen (see
+ // below) to prevent leaking poison.
+ if (isa<SelectInst>(ReductionRoot) &&
+ isBoolLogicOp(cast<Instruction>(ReductionRoot)) &&
+ NumReducedVals != ReduxWidth)
+ break;
- // Intersect the fast-math-flags from all reduction operations.
- FastMathFlags RdxFMF;
- RdxFMF.set();
- for (Value *RdxVal : VL) {
- if (auto *FPMO = dyn_cast<FPMathOperator>(
- ReducedValsToOps.find(RdxVal)->second))
- RdxFMF &= FPMO->getFastMathFlags();
- }
- // Estimate cost.
- InstructionCost TreeCost = V.getTreeCost(VL);
- InstructionCost ReductionCost =
- getReductionCost(TTI, VL[0], ReduxWidth, RdxFMF);
- InstructionCost Cost = TreeCost + ReductionCost;
- if (!Cost.isValid()) {
- LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
- return nullptr;
- }
- if (Cost >= -SLPCostThreshold) {
- V.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "HorSLPNotBeneficial",
- ReducedValsToOps.find(VL[0])->second)
- << "Vectorizing horizontal reduction is possible"
- << "but not beneficial with cost " << ore::NV("Cost", Cost)
- << " and threshold "
- << ore::NV("Threshold", -SLPCostThreshold);
- });
- AdjustReducedVals();
- continue;
- }
+ V.computeMinimumValueSizes();
- LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
- << Cost << ". (HorRdx)\n");
+ // Estimate cost.
+ InstructionCost TreeCost =
+ V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth));
+ InstructionCost ReductionCost =
+ getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF);
+ InstructionCost Cost = TreeCost + ReductionCost;
+ if (!Cost.isValid()) {
+ LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
+ return nullptr;
+ }
+ if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {
- return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
- ReducedValsToOps.find(VL[0])->second)
- << "Vectorized horizontal reduction with cost "
- << ore::NV("Cost", Cost) << " and with tree size "
- << ore::NV("TreeSize", V.getTreeSize());
+ return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
+ cast<Instruction>(VL[0]))
+ << "Vectorizing horizontal reduction is possible"
+ << "but not beneficial with cost " << ore::NV("Cost", Cost)
+ << " and threshold "
+ << ore::NV("Threshold", -SLPCostThreshold);
});
+ break;
+ }
- Builder.setFastMathFlags(RdxFMF);
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+ << Cost << ". (HorRdx)\n");
+ V.getORE()->emit([&]() {
+ return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
+ cast<Instruction>(VL[0]))
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize());
+ });
- // Vectorize a tree.
- Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues);
+ // Vectorize a tree.
+ DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+ Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
- // Emit a reduction. If the root is a select (min/max idiom), the insert
- // point is the compare condition of that select.
- Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
- if (isCmpSelMinMax(RdxRootInst))
- Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst));
- else
- Builder.SetInsertPoint(RdxRootInst);
+ // Emit a reduction. If the root is a select (min/max idiom), the insert
+ // point is the compare condition of that select.
+ Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+ if (isCmpSelMinMax(RdxRootInst))
+ Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
+ else
+ Builder.SetInsertPoint(RdxRootInst);
- // To prevent poison from leaking across what used to be sequential,
- // safe, scalar boolean logic operations, the reduction operand must be
- // frozen.
- if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
- VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
+ // To prevent poison from leaking across what used to be sequential, safe,
+ // scalar boolean logic operations, the reduction operand must be frozen.
+ if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
+ VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
- Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
- if (!VectorizedTree) {
- // Initialize the final value in the reduction.
- VectorizedTree = ReducedSubTree;
- } else {
- // Update the final value in the reduction.
- Builder.SetCurrentDebugLocation(
- cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
- VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- ReducedSubTree, "op.rdx", ReductionOps);
- }
- // Count vectorized reduced values to exclude them from final reduction.
- for (Value *V : VL)
- ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0)
- .first->getSecond();
- Pos += ReduxWidth;
- Start = Pos;
- ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);
+ if (!VectorizedTree) {
+ // Initialize the final value in the reduction.
+ VectorizedTree = ReducedSubTree;
+ } else {
+ // Update the final value in the reduction.
+ Builder.SetCurrentDebugLocation(Loc);
+ VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+ ReducedSubTree, "op.rdx", ReductionOps);
}
+ i += ReduxWidth;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - i);
}
+
if (VectorizedTree) {
// Finish the reduction.
- // Need to add extra arguments and not vectorized possible reduction
- // values.
- for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
- ArrayRef<Value *> Candidates = ReducedVals[I];
- for (Value *RdxVal : Candidates) {
- auto It = VectorizedVals.find(RdxVal);
- if (It != VectorizedVals.end()) {
- --It->getSecond();
- if (It->second == 0)
- VectorizedVals.erase(It);
- continue;
- }
- Instruction *RedOp = ReducedValsToOps.find(RdxVal)->second;
- Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
- ReductionOpsListType Ops;
- if (auto *Sel = dyn_cast<SelectInst>(RedOp))
- Ops.emplace_back().push_back(Sel->getCondition());
- Ops.emplace_back().push_back(RedOp);
- Value *StableRdxVal = RdxVal;
- auto TVIt = TrackedVals.find(RdxVal);
- if (TVIt != TrackedVals.end())
- StableRdxVal = TVIt->second;
-
- VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- StableRdxVal, "op.rdx", RedOp);
- }
+ for (; i < NumReducedVals; ++i) {
+ auto *I = cast<Instruction>(ReducedVals[i]);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ VectorizedTree =
+ createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps);
}
for (auto &Pair : ExternallyUsedValues) {
// Add each externally used value to the final reduction.
for (auto *I : Pair.second) {
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- ReductionOpsListType Ops;
- if (auto *Sel = dyn_cast<SelectInst>(I))
- Ops.emplace_back().push_back(Sel->getCondition());
- Ops.emplace_back().push_back(I);
- Value *StableRdxVal = Pair.first;
- auto TVIt = TrackedVals.find(Pair.first);
- if (TVIt != TrackedVals.end())
- StableRdxVal = TVIt->second;
VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
- StableRdxVal, "op.rdx", Ops);
+ Pair.first, "op.extra", I);
}
}
@@ -10327,8 +10126,7 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
/// performed.
static bool tryToVectorizeHorReductionOrInstOperands(
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL,
- const TargetLibraryInfo &TLI,
+ TargetTransformInfo *TTI,
const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
@@ -10355,14 +10153,13 @@ static bool tryToVectorizeHorReductionOrInstOperands(
SmallPtrSet<Value *, 8> VisitedInstrs;
SmallVector<WeakTrackingVH> PostponedInsts;
bool Res = false;
- auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst,
- Value *&B0,
- Value *&B1) -> Value * {
+ auto &&TryToReduce = [TTI, &P, &R](Instruction *Inst, Value *&B0,
+ Value *&B1) -> Value * {
bool IsBinop = matchRdxBop(Inst, B0, B1);
bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
if (IsBinop || IsSelect) {
HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI))
+ if (HorRdx.matchAssociativeReduction(P, Inst))
return HorRdx.tryToReduce(R, TTI);
}
return nullptr;
@@ -10445,8 +10242,8 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
return tryToVectorize(I, R);
};
- return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL,
- *TLI, ExtraVectorization);
+ return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
+ ExtraVectorization);
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
index 00ac1265b0629..76e0f39429aad 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
@@ -90,16 +90,23 @@ return:
define float @test_merge_anyof_v4sf(<4 x float> %t) {
; CHECK-LABEL: @test_merge_anyof_v4sf(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[T_FR7:%.*]] = freeze <4 x float> [[T:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR7]], zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[T_FR7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
-; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP3]], 0
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR7]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[SHIFT]], [[T_FR7]]
-; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP4]], i64 0
-; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[ADD]], float 0.000000e+00
+; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i4 [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[T_FR]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
+; CHECK-NEXT: [[OR_COND3:%.*]] = or i1 [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
+; CHECK-NEXT: [[OR_COND4:%.*]] = or i1 [[OR_COND3]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
+; CHECK-NEXT: [[OR_COND5:%.*]] = or i1 [[OR_COND4]], [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
+; CHECK-NEXT: [[OR_COND6:%.*]] = or i1 [[OR_COND5]], [[TMP7]]
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[SHIFT]], [[T_FR]]
+; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP8]], i64 0
+; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[ADD]]
; CHECK-NEXT: ret float [[RETVAL_0]]
;
entry:
@@ -412,16 +419,24 @@ return:
define float @test_merge_anyof_v4si(<4 x i32> %t) {
; CHECK-LABEL: @test_merge_anyof_v4si(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[T_FR7:%.*]] = freeze <4 x i32> [[T:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[T_FR7]], <i32 -256, i32 -256, i32 -256, i32 -256>
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], <i32 -255, i32 -255, i32 -255, i32 -255>
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i1> [[TMP1]] to i4
-; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP2]], 0
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR7]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T_FR7]]
-; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x i32> [[T:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[T_FR]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i4 [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[T_FR]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
+; CHECK-NEXT: [[OR_COND3:%.*]] = or i1 [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
+; CHECK-NEXT: [[OR_COND4:%.*]] = or i1 [[OR_COND3]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
+; CHECK-NEXT: [[OR_COND5:%.*]] = or i1 [[OR_COND4]], [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
+; CHECK-NEXT: [[OR_COND6:%.*]] = or i1 [[OR_COND5]], [[TMP7]]
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T_FR]]
+; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP8]], i64 0
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float
-; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[CONV]], float 0.000000e+00
+; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[CONV]]
; CHECK-NEXT: ret float [[RETVAL_0]]
;
entry:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index 4064c3a5399f6..df19a823d89d2 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -41,7 +41,7 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 4, i32 2, i32 5, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[TMP2]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
index 4142e3199f05b..01d743fcbfe97 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -15,10 +15,10 @@ define void @PR28330(i32 %n) {
; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
; DEFAULT: for.body:
-; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; DEFAULT-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[P17]]
+; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
; DEFAULT-NEXT: br label [[FOR_BODY]]
;
; GATHER-LABEL: @PR28330(
@@ -27,10 +27,10 @@ define void @PR28330(i32 %n) {
; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; GATHER-NEXT: br label [[FOR_BODY:%.*]]
; GATHER: for.body:
-; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; GATHER-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; GATHER-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; GATHER-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[P17]]
+; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
; GATHER-NEXT: br label [[FOR_BODY]]
;
; MAX-COST-LABEL: @PR28330(
@@ -39,10 +39,10 @@ define void @PR28330(i32 %n) {
; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
; MAX-COST: for.body:
-; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; MAX-COST-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[P17]]
+; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
; MAX-COST-NEXT: br label [[FOR_BODY]]
;
entry:
@@ -92,10 +92,10 @@ define void @PR32038(i32 %n) {
; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
; DEFAULT: for.body:
-; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; DEFAULT-NEXT: [[OP_RDX]] = add i32 [[TMP3]], -5
+; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5
; DEFAULT-NEXT: br label [[FOR_BODY]]
;
; GATHER-LABEL: @PR32038(
@@ -104,10 +104,10 @@ define void @PR32038(i32 %n) {
; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; GATHER-NEXT: br label [[FOR_BODY:%.*]]
; GATHER: for.body:
-; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; GATHER-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; GATHER-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; GATHER-NEXT: [[OP_RDX]] = add i32 [[TMP3]], -5
+; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5
; GATHER-NEXT: br label [[FOR_BODY]]
;
; MAX-COST-LABEL: @PR32038(
@@ -116,10 +116,10 @@ define void @PR32038(i32 %n) {
; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
; MAX-COST: for.body:
-; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; MAX-COST-NEXT: [[OP_RDX]] = add i32 [[TMP3]], -5
+; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5
; MAX-COST-NEXT: br label [[FOR_BODY]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index 12a2cd2a6834f..24c69fb667272 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -204,10 +204,10 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @reduction_v4i32(
-; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 7, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 6, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 03a29f48a739e..962f8ca42bf86 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -204,10 +204,10 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @reduction_v4i32(
-; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 7, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 6, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
index 0d20a0e01d116..6817269f394dc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
@@ -8,7 +8,7 @@ define void @mainTest(i32* %ptr) #0 {
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[PTR:%.*]], null
; CHECK-NEXT: br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX3:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA3:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
@@ -17,10 +17,10 @@ define void @mainTest(i32* %ptr) #0 {
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP3]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP7]], [[TMP4]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP3]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], [[TMP2]]
-; CHECK-NEXT: [[OP_RDX3]] = add i32 [[OP_RDX2]], 1
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP7]], 1
+; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], [[TMP4]]
+; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], [[TMP3]]
+; CHECK-NEXT: [[OP_EXTRA3]] = add i32 [[OP_EXTRA2]], [[TMP2]]
; CHECK-NEXT: br label [[LOOP]]
; CHECK: bail_out:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
index 54559149c34bf..d15494e092c25 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@@ -7,20 +7,20 @@ define void @test() #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX1:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA1:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], <i64 2, i64 3, i64 1, i64 0>
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], <i64 3, i64 2, i64 1, i64 0>
; CHECK-NEXT: [[TMP3]] = extractelement <4 x i64> [[TMP2]], i32 3
; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP4]], 32
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP2]]
; CHECK-NEXT: [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP7]], [[TMP3]]
-; CHECK-NEXT: [[OP_RDX1]] = add i64 [[OP_RDX]], 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i64 [[TMP7]], 0
+; CHECK-NEXT: [[OP_EXTRA1]] = add i64 [[OP_EXTRA]], [[TMP3]]
; CHECK-NEXT: br label [[LOOP]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index 87db4dbd4da10..7668747a75ace 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -5,98 +5,98 @@
define void @Test(i32) {
; CHECK-LABEL: @Test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP0]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP0]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP0]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP0]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP0]], i32 4
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP0]], i32 5
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP0]], i32 6
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP0]], i32 7
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP0]], i32 8
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP0]], i32 9
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP0]], i32 10
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP0]], i32 11
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP0]], i32 12
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP0]], i32 13
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP0]], i32 14
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP0]], i32 15
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP36:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
-; CHECK-NEXT: [[TMP27:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
-; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]])
-; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP28]], [[TMP29]]
-; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP27]])
-; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP30]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[TMP0]]
-; CHECK-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX3]], [[TMP0]]
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX4]], i32 0
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP26]], i32 1
-; CHECK-NEXT: [[TMP34:%.*]] = and <2 x i32> [[TMP31]], [[TMP33]]
-; CHECK-NEXT: [[TMP35:%.*]] = add <2 x i32> [[TMP31]], [[TMP33]]
-; CHECK-NEXT: [[TMP36]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> [[TMP35]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0:%.*]]
+; CHECK-NEXT: [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA2]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_EXTRA26]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = and <2 x i32> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[TMP10]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: br label [[LOOP]]
;
; FORCE_REDUCTION-LABEL: @Test(
; FORCE_REDUCTION-NEXT: entry:
-; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
-; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP0]], i32 1
-; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP0]], i32 2
-; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP0]], i32 3
-; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP0]], i32 4
-; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP0]], i32 5
-; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP0]], i32 6
-; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP0]], i32 7
-; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0
-; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[TMP0]], i32 1
-; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 2
-; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP0]], i32 3
-; FORCE_REDUCTION-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP0]], i32 4
-; FORCE_REDUCTION-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP0]], i32 5
-; FORCE_REDUCTION-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP0]], i32 6
-; FORCE_REDUCTION-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP0]], i32 7
-; FORCE_REDUCTION-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP0]], i32 8
-; FORCE_REDUCTION-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP0]], i32 9
-; FORCE_REDUCTION-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP0]], i32 10
-; FORCE_REDUCTION-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP0]], i32 11
-; FORCE_REDUCTION-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP0]], i32 12
-; FORCE_REDUCTION-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP0]], i32 13
-; FORCE_REDUCTION-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP0]], i32 14
-; FORCE_REDUCTION-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP0]], i32 15
; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]]
; FORCE_REDUCTION: loop:
-; FORCE_REDUCTION-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP36:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
-; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; FORCE_REDUCTION-NEXT: [[TMP26:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
-; FORCE_REDUCTION-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]])
-; FORCE_REDUCTION-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
-; FORCE_REDUCTION-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP26]])
-; FORCE_REDUCTION-NEXT: [[OP_RDX13:%.*]] = and i32 [[TMP29]], [[TMP0]]
-; FORCE_REDUCTION-NEXT: [[OP_RDX14:%.*]] = and i32 [[OP_RDX13]], [[TMP0]]
-; FORCE_REDUCTION-NEXT: [[OP_RDX15:%.*]] = and i32 [[OP_RDX14]], [[TMP0]]
-; FORCE_REDUCTION-NEXT: [[OP_RDX16:%.*]] = and i32 [[OP_RDX15]], [[TMP27]]
-; FORCE_REDUCTION-NEXT: [[OP_RDX17:%.*]] = and i32 [[OP_RDX16]], [[TMP28]]
-; FORCE_REDUCTION-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX17]], i32 0
-; FORCE_REDUCTION-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
-; FORCE_REDUCTION-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP31]], i32 0
-; FORCE_REDUCTION-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP31]], i32 1
-; FORCE_REDUCTION-NEXT: [[TMP34:%.*]] = and <2 x i32> [[TMP30]], [[TMP33]]
-; FORCE_REDUCTION-NEXT: [[TMP35:%.*]] = add <2 x i32> [[TMP30]], [[TMP33]]
-; FORCE_REDUCTION-NEXT: [[TMP36]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> [[TMP35]], <2 x i32> <i32 0, i32 3>
+; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP12:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
+; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240>
+; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496
+; FORCE_REDUCTION-NEXT: [[VAL_34:%.*]] = add i32 [[TMP2]], 8555
+; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP3]])
+; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]]
+; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0:%.*]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA2]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP2]]
+; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529
+; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA27]], [[VAL_39]]
+; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685
+; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_40]], i32 0
+; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1
+; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[VAL_41]], i32 0
+; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = and <2 x i32> [[TMP8]], [[TMP9]]
+; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP8]], [[TMP9]]
+; FORCE_REDUCTION-NEXT: [[TMP12]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> <i32 0, i32 3>
; FORCE_REDUCTION-NEXT: br label [[LOOP]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
index ba0585f11bade..f878bda14ad84 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
@@ -4,20 +4,20 @@
define void @mainTest(i32 %param, i32 * %vals, i32 %len) {
; CHECK-LABEL: @mainTest(
; CHECK-NEXT: bci_15.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 31>, i32 [[PARAM:%.*]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 31, i32 poison>, i32 [[PARAM:%.*]], i32 1
; CHECK-NEXT: br label [[BCI_15:%.*]]
; CHECK: bci_15:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ]
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 -1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 -1>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15
; CHECK-NEXT: store atomic i32 [[TMP4]], i32* [[VALS:%.*]] unordered, align 4
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0
-; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[V44]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[V44]], i32 0
+; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1
; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]]
; CHECK: loopexit:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
index 2f648016361fc..e911d9ca6d100 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
@@ -60,7 +60,7 @@ define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
define i8 @i(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @i(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 3, i32 0, i32 6, i32 5>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP2]])
; CHECK-NEXT: ret i8 [[TMP3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
index 4af7c0bed1733..e3ee84f84f7f1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -60,7 +60,7 @@ define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
define i8 @i(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @i(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 3, i32 0, i32 6, i32 5>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP2]])
; CHECK-NEXT: ret i8 [[TMP3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index 1ea59c3949fe6..d9ca7d565baac 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -13,30 +13,58 @@ define float @baz() {
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
-; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; CHECK-NEXT: store float [[OP_RDX1]], float* @res, align 4
-; CHECK-NEXT: ret float [[OP_RDX1]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x float> poison, float [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP5]], i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP10]], i32 4
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP9]], i32 5
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP5]], i32 6
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP4]], i32 7
+; CHECK-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP18]])
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP19]], [[CONV]]
+; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; CHECK-NEXT: store float [[OP_EXTRA1]], float* @res, align 4
+; CHECK-NEXT: ret float [[OP_EXTRA1]]
;
; THRESHOLD-LABEL: @baz(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
-; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
-; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; THRESHOLD-NEXT: store float [[OP_RDX1]], float* @res, align 4
-; THRESHOLD-NEXT: ret float [[OP_RDX1]]
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
+; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
+; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
+; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; THRESHOLD-NEXT: [[TMP11:%.*]] = insertelement <8 x float> poison, float [[TMP10]], i32 0
+; THRESHOLD-NEXT: [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP9]], i32 1
+; THRESHOLD-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[TMP5]], i32 2
+; THRESHOLD-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP4]], i32 3
+; THRESHOLD-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP10]], i32 4
+; THRESHOLD-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[TMP9]], i32 5
+; THRESHOLD-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP5]], i32 6
+; THRESHOLD-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[TMP4]], i32 7
+; THRESHOLD-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP18]])
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP19]], [[CONV]]
+; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT: store float [[OP_EXTRA1]], float* @res, align 4
+; THRESHOLD-NEXT: ret float [[OP_EXTRA1]]
;
entry:
%0 = load i32, i32* @n, align 4
@@ -79,10 +107,10 @@ define float @bazz() {
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
-; CHECK-NEXT: store float [[OP_RDX1]], float* @res, align 4
-; CHECK-NEXT: ret float [[OP_RDX1]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
+; CHECK-NEXT: store float [[OP_EXTRA1]], float* @res, align 4
+; CHECK-NEXT: ret float [[OP_EXTRA1]]
;
; THRESHOLD-LABEL: @bazz(
; THRESHOLD-NEXT: entry:
@@ -95,10 +123,10 @@ define float @bazz() {
; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
-; THRESHOLD-NEXT: store float [[OP_RDX1]], float* @res, align 4
-; THRESHOLD-NEXT: ret float [[OP_RDX1]]
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
+; THRESHOLD-NEXT: store float [[OP_EXTRA1]], float* @res, align 4
+; THRESHOLD-NEXT: ret float [[OP_EXTRA1]]
;
entry:
%0 = load i32, i32* @n, align 4
@@ -315,25 +343,25 @@ entry:
define float @f(float* nocapture readonly %x) {
; CHECK-LABEL: @f(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <32 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_32]] to <16 x float>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x float>, <16 x float>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <16 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
; CHECK-NEXT: ret float [[OP_RDX]]
;
; THRESHOLD-LABEL: @f(
; THRESHOLD-NEXT: entry:
-; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <32 x float>*
-; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
-; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32
-; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_32]] to <16 x float>*
-; THRESHOLD-NEXT: [[TMP3:%.*]] = load <16 x float>, <16 x float>* [[TMP2]], align 4
-; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP3]])
+; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <16 x float>*
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
+; THRESHOLD-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
+; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP3]])
+; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
; THRESHOLD-NEXT: ret float [[OP_RDX]]
;
@@ -491,8 +519,8 @@ define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <32 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; CHECK-NEXT: ret float [[OP_RDX]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; CHECK-NEXT: ret float [[OP_EXTRA]]
;
; THRESHOLD-LABEL: @f1(
; THRESHOLD-NEXT: entry:
@@ -501,8 +529,8 @@ define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) {
; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <32 x float>*
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; THRESHOLD-NEXT: ret float [[OP_RDX]]
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; THRESHOLD-NEXT: ret float [[OP_EXTRA]]
;
entry:
%rem = srem i32 %a, %b
@@ -609,50 +637,50 @@ define float @loadadd31(float* nocapture readonly %x) {
; CHECK-LABEL: @loadadd31(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <16 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <8 x float>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* [[TMP2]], align 4
-; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_24]] to <4 x float>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_28]], align 4
-; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX_29]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP5]])
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]]
-; CHECK-NEXT: ret float [[OP_RDX3]]
+; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
+; CHECK-NEXT: ret float [[TMP12]]
;
; THRESHOLD-LABEL: @loadadd31(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
-; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <16 x float>*
-; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
-; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
-; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <8 x float>*
-; THRESHOLD-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* [[TMP2]], align 4
-; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
-; THRESHOLD-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_24]] to <4 x float>*
-; THRESHOLD-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
-; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_28]], align 4
-; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
-; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX_29]], align 4
-; THRESHOLD-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
+; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
+; THRESHOLD-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
+; THRESHOLD-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; THRESHOLD-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
+; THRESHOLD-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
+; THRESHOLD-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP7]])
+; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP5]])
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
+; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]]
-; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]]
-; THRESHOLD-NEXT: ret float [[OP_RDX3]]
+; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]]
+; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
+; THRESHOLD-NEXT: ret float [[TMP12]]
;
entry:
%arrayidx = getelementptr inbounds float, float* %x, i64 1
@@ -752,25 +780,25 @@ define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
-; CHECK-NEXT: ret float [[OP_RDX2]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; CHECK-NEXT: ret float [[OP_EXTRA1]]
;
; THRESHOLD-LABEL: @extra_args(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
-; THRESHOLD-NEXT: ret float [[OP_RDX2]]
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT: ret float [[OP_EXTRA1]]
;
entry:
%mul = mul nsw i32 %b, %a
@@ -808,29 +836,29 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 3.000000e+00
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 5.000000e+00
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 5.000000e+00
-; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast float [[OP_RDX3]], [[CONV]]
-; CHECK-NEXT: ret float [[OP_RDX4]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
+; CHECK-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00
+; CHECK-NEXT: [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]]
+; CHECK-NEXT: ret float [[OP_EXTRA3]]
;
; THRESHOLD-LABEL: @extra_args_same_several_times(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 3.000000e+00
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 5.000000e+00
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 5.000000e+00
-; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX4:%.*]] = fadd fast float [[OP_RDX3]], [[CONV]]
-; THRESHOLD-NEXT: ret float [[OP_RDX4]]
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
+; THRESHOLD-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00
+; THRESHOLD-NEXT: [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]]
+; THRESHOLD-NEXT: ret float [[OP_EXTRA3]]
;
entry:
%mul = mul nsw i32 %b, %a
@@ -871,28 +899,28 @@ define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; CHECK-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
+; CHECK-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONVC]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00
-; CHECK-NEXT: ret float [[OP_RDX3]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; CHECK-NEXT: ret float [[OP_EXTRA1]]
;
; THRESHOLD-LABEL: @extra_args_no_replace(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
+; THRESHOLD-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
+; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONVC]]
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00
-; THRESHOLD-NEXT: ret float [[OP_RDX3]]
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT: ret float [[OP_EXTRA1]]
;
entry:
%mul = mul nsw i32 %b, %a
@@ -991,9 +1019,9 @@ define i32 @wobble(i32 %arg, i32 %bar) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = add nuw i32 [[OP_RDX]], [[ARG]]
-; CHECK-NEXT: ret i32 [[OP_RDX2]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP6]], [[ARG]]
+; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP3]]
+; CHECK-NEXT: ret i32 [[OP_EXTRA2]]
;
; THRESHOLD-LABEL: @wobble(
; THRESHOLD-NEXT: bb:
@@ -1006,9 +1034,9 @@ define i32 @wobble(i32 %arg, i32 %bar) {
; THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
; THRESHOLD-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
; THRESHOLD-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[TMP6]], [[TMP3]]
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = add nuw i32 [[OP_RDX]], [[ARG]]
-; THRESHOLD-NEXT: ret i32 [[OP_RDX2]]
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP6]], [[ARG]]
+; THRESHOLD-NEXT: [[OP_EXTRA2:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP3]]
+; THRESHOLD-NEXT: ret i32 [[OP_EXTRA2]]
;
bb:
%x1 = xor i32 %arg, %bar
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
index 472ec767f540f..90a259f5efbc9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
@@ -10,28 +10,29 @@ define i32 @bar() local_unnamed_addr {
; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef
; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef
; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_1]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 4
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[SUB86_1]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[ADD78_1]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 2
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_1]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_3]], i32 4
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 4>
-; CHECK-NEXT: [[TMP10:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
-; CHECK-NEXT: [[TMP13:%.*]] = lshr <16 x i32> [[TMP12]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP15:%.*]] = mul nuw <16 x i32> [[TMP14]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP16:%.*]] = add <16 x i32> [[TMP15]], [[TMP12]]
-; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP17]])
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP18]], 16
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_3]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[SUB102_1]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD94_1]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[ADD78_1]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[SUB86_1]], i32 4
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[ADD78_2]], i32 5
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB86_1]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD78_1]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[ADD94_1]], i32 2
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_1]], i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SUB102_3]], i32 4
+; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> poison, <16 x i32> <i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 4>
+; CHECK-NEXT: [[TMP11:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT: [[TMP12:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 21, i32 22, i32 7, i32 24, i32 25, i32 10, i32 27, i32 28, i32 13, i32 30, i32 31>
+; CHECK-NEXT: [[TMP14:%.*]] = lshr <16 x i32> [[TMP13]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i32> [[TMP14]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP16:%.*]] = mul nuw <16 x i32> [[TMP15]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP17:%.*]] = add <16 x i32> [[TMP16]], [[TMP13]]
+; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i32> [[TMP17]], [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP18]])
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP19]], 16
; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 undef, [[SHR]]
; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
; CHECK-NEXT: ret i32 [[SHR120]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index 5769a293d0357..460f222763b8c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -230,12 +230,17 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
; CHECK-LABEL: @logical_and_icmp_clamp(
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
-; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP4]], i1 [[TMP6]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: [[S4:%.*]] = select i1 [[TMP4]], i1 [[TMP5]], i1 false
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[TMP6]], i1 false
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[TMP7]], i1 false
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[TMP8]], i1 false
+; CHECK-NEXT: ret i1 [[S7]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
@@ -260,17 +265,53 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
}
define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT: call void @use1(i1 [[TMP2]])
-; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]])
-; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 [[TMP7]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX]]
+; SSE-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
+; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; SSE-NEXT: call void @use1(i1 [[C2]])
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X3]], i32 2
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3
+; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
+; SSE-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; SSE-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; SSE-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
+; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
+; SSE-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i1 [[C2]], i1 false
+; SSE-NEXT: [[S5:%.*]] = select i1 [[OP_EXTRA]], i1 [[D1]], i1 false
+; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; SSE-NEXT: ret i1 [[S7]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
+; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
+; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
+; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; AVX-NEXT: call void @use1(i1 [[C2]])
+; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42
+; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
+; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
+; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; AVX-NEXT: ret i1 [[S7]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
@@ -297,20 +338,27 @@ define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) {
; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_select(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
+; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
+; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42
+; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
+; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
; CHECK-NEXT: call void @use1(i1 [[S2]])
-; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 [[TMP8]], i1 false
-; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[S2]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX1]]
+; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; CHECK-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
+; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; CHECK-NEXT: ret i1 [[S7]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
@@ -345,11 +393,11 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
; CHECK-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
; CHECK-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X0]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X1]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[X2]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[X3]], i32 3
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[Y0]], i32 4
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y1]], i32 5
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[Y2]], i32 6
@@ -387,37 +435,46 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
; SSE-LABEL: @logical_and_icmp_clamp_partial(
-; SSE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
-; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1
-; SSE-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[TMP5]], <i32 42, i32 42>
-; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42
-; SSE-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3
+; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
+; SSE-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 17>
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; SSE-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; SSE-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
-; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
-; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP9]], i1 [[TMP10]], i1 false
-; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
-; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[TMP11]], i1 false
-; SSE-NEXT: [[OP_RDX2:%.*]] = select i1 [[OP_RDX1]], i1 [[C2]], i1 false
-; SSE-NEXT: ret i1 [[OP_RDX2]]
+; SSE-NEXT: [[S5:%.*]] = select i1 [[TMP9]], i1 [[D1]], i1 false
+; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; SSE-NEXT: ret i1 [[S7]]
;
; AVX-LABEL: @logical_and_icmp_clamp_partial(
-; AVX-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
-; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0
-; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[TMP3]], 42
-; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[TMP2]], 42
-; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42
-; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; AVX-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
-; AVX-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; AVX-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[C1]], i1 false
-; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C0]], i1 false
-; AVX-NEXT: [[OP_RDX2:%.*]] = select i1 [[OP_RDX1]], i1 [[C2]], i1 false
-; AVX-NEXT: ret i1 [[OP_RDX2]]
+; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
+; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
+; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
+; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; AVX-NEXT: [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false
+; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; AVX-NEXT: ret i1 [[S7]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
@@ -442,17 +499,44 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
}
define i1 @logical_and_icmp_clamp_pred_
diff (<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_pred_
diff (
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX]]
+; SSE-LABEL: @logical_and_icmp_clamp_pred_
diff (
+; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
+; SSE-NEXT: [[S4:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
+; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
+; SSE-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[TMP8]], i1 false
+; SSE-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
+; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[TMP9]], i1 false
+; SSE-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
+; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[TMP10]], i1 false
+; SSE-NEXT: ret i1 [[S7]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_pred_
diff (
+; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
+; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
+; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; AVX-NEXT: [[C3:%.*]] = icmp ult i32 [[X3]], 42
+; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
+; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
+; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; AVX-NEXT: ret i1 [[S7]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
@@ -482,8 +566,8 @@ define i1 @logical_and_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
; CHECK-NEXT: [[S3:%.*]] = select i1 [[C:%.*]], i1 [[C]], i1 false
; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 [[S3]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP3]], i1 [[S3]], i1 false
+; CHECK-NEXT: ret i1 [[OP_EXTRA]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
@@ -511,8 +595,8 @@ define i1 @logical_or_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
; CHECK-NEXT: [[S3:%.*]] = select i1 [[C:%.*]], i1 true, i1 [[C]]
; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 true, i1 [[S3]]
-; CHECK-NEXT: ret i1 [[OP_RDX]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP3]], i1 true, i1 [[S3]]
+; CHECK-NEXT: ret i1 [[OP_EXTRA]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
index 0fac440081208..362967fcc3c11 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
@@ -24,15 +24,15 @@ define i32 @test(i32* nocapture readonly %p) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <8 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[SUM]]
+; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[SUM]]
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 [[OP_RDX]]
+; CHECK-NEXT: ret i32 [[OP_EXTRA]]
;
entry:
%arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
@@ -97,17 +97,17 @@ define i32 @test2(i32* nocapture readonly %p, i32* nocapture readonly %q) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <8 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[Q:%.*]] to <8 x i32>*
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
-; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP5]], [[SUM]]
+; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]]
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 [[OP_RDX]]
+; CHECK-NEXT: ret i32 [[OP_EXTRA]]
;
entry:
%arrayidx.p.1 = getelementptr inbounds i32, i32* %p, i64 1
@@ -188,18 +188,18 @@ define i32 @test3(i32* nocapture readonly %p, i32* nocapture readonly %q) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <8 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[Q:%.*]] to <8 x i32>*
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[SHUFFLE]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[SHUFFLE]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
-; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP5]], [[SUM]]
+; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]]
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 [[OP_RDX]]
+; CHECK-NEXT: ret i32 [[OP_EXTRA]]
;
entry:
%arrayidx.p.1 = getelementptr inbounds i32, i32* %p, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
index 7e0dc28681e47..28417c0ac0201 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -11,9 +11,9 @@
; }
; Vector cost is 5, Scalar cost is 7
-; AVX: Adding cost -2 for reduction that starts with %0 = load i32, i32* %p, align 4 (It is a splitting reduction)
+; AVX: Adding cost -2 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
; Vector cost is 6, Scalar cost is 7
-; SSE: Adding cost -1 for reduction that starts with %0 = load i32, i32* %p, align 4 (It is a splitting reduction)
+; SSE: Adding cost -1 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
define i32 @test_add(i32* nocapture readonly %p) {
; CHECK-LABEL: @test_add(
; CHECK-NEXT: entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
index 8202e5e4a722b..0e5d3bca07549 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
@@ -15,8 +15,8 @@ define void @hoge() {
; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i16> [[TMP0]] to <2 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> <i32 undef, i32 63>, [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], undef
-; CHECK-NEXT: [[SHUFFLE4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE4]], <i32 15, i32 undef, i32 31, i32 47>
+; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE10]], <i32 undef, i32 15, i32 31, i32 47>
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef
; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63
@@ -24,13 +24,18 @@ define void @hoge() {
; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP6]], undef
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]], <i32 -49, i32 -33, i32 -33, i32 -17>
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP10]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = icmp slt i32 [[OP_RDX1]], undef
-; CHECK-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 undef
-; CHECK-NEXT: [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX3]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP9]], undef
+; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP9]], i32 undef
+; CHECK-NEXT: [[OP_EXTRA2:%.*]] = icmp slt i32 [[OP_EXTRA1]], undef
+; CHECK-NEXT: [[OP_EXTRA3:%.*]] = select i1 [[OP_EXTRA2]], i32 [[OP_EXTRA1]], i32 undef
+; CHECK-NEXT: [[OP_EXTRA4:%.*]] = icmp slt i32 [[OP_EXTRA3]], undef
+; CHECK-NEXT: [[OP_EXTRA5:%.*]] = select i1 [[OP_EXTRA4]], i32 [[OP_EXTRA3]], i32 undef
+; CHECK-NEXT: [[OP_EXTRA6:%.*]] = icmp slt i32 [[OP_EXTRA5]], undef
+; CHECK-NEXT: [[OP_EXTRA7:%.*]] = select i1 [[OP_EXTRA6]], i32 [[OP_EXTRA5]], i32 undef
+; CHECK-NEXT: [[OP_EXTRA8:%.*]] = icmp slt i32 [[OP_EXTRA7]], undef
+; CHECK-NEXT: [[OP_EXTRA9:%.*]] = select i1 [[OP_EXTRA8]], i32 [[OP_EXTRA7]], i32 undef
+; CHECK-NEXT: [[T45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA9]]
; CHECK-NEXT: unreachable
;
bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
index e46169a83b97a..a9a6aa937171b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
@@ -21,19 +21,19 @@ define void @test() {
; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* undef, i64 0, i64 3
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[I]] to <4 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 8
+; CHECK-NEXT: [[I5:%.*]] = add i32 undef, undef
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[TMP2]], undef
-; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[OP_RDX3]], undef
-; CHECK-NEXT: [[I10:%.*]] = add i32 [[OP_RDX4]], undef
-; CHECK-NEXT: [[I11:%.*]] = add i32 [[OP_RDX4]], [[I10]]
+; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add i32 [[TMP2]], [[I5]]
+; CHECK-NEXT: [[I10:%.*]] = add i32 [[OP_EXTRA2]], undef
+; CHECK-NEXT: [[I11:%.*]] = add i32 [[OP_EXTRA2]], [[I10]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[I1]] to <4 x i32>*
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[I12:%.*]] = add i32 undef, undef
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP5]], undef
-; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], undef
-; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], undef
-; CHECK-NEXT: [[I18:%.*]] = add i32 [[OP_RDX2]], [[I11]]
-; CHECK-NEXT: [[I19:%.*]] = add i32 [[OP_RDX2]], [[I18]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP5]], [[I12]]
+; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], undef
+; CHECK-NEXT: [[I18:%.*]] = add i32 [[OP_EXTRA1]], [[I11]]
+; CHECK-NEXT: [[I19:%.*]] = add i32 [[OP_EXTRA1]], [[I18]]
; CHECK-NEXT: [[I20:%.*]] = add i32 undef, [[I19]]
; CHECK-NEXT: [[I21:%.*]] = add i32 undef, [[I20]]
; CHECK-NEXT: [[I22:%.*]] = add i32 undef, [[I21]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
index 654d6881d03e4..9502620a6b494 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -7,8 +7,8 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <2 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
@@ -53,9 +53,9 @@ define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <4 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 2, i32 1, i32 3, i32 1, i32 1, i32 0, i32 2, i32 1>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 1, i32 0, i32 2, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
@@ -104,9 +104,9 @@ define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <4 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 2, i32 3, i32 3, i32 0, i32 1, i32 0, i32 2, i32 1>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 3, i32 2, i32 3, i32 0, i32 1, i32 0, i32 2, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
index b2ae15cf6166f..f48d5e27e8aa6 100644
--- a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
@@ -43,7 +43,7 @@ declare i32 @llvm.umin.i32(i32, i32)
define void @test2() {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <4 x i32> poison, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77)
More information about the llvm-commits
mailing list