[llvm] r294934 - [SLP] Fix for PR31690: Allow the use of extra values in horizontal reductions
Andrew Adams via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 21 18:39:20 PST 2017
I'm seeing a horizontal sum reduction on ints produce different results
starting at this commit. Details here:
https://bugs.llvm.org/show_bug.cgi?id=32036
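For reference, the affected shape is an integer horizontal sum along these
lines (a minimal sketch, not the exact reproducer from the bug report; the
name and trip count are illustrative):

```cpp
#include <cstdint>

// Horizontal sum over ints; the SLP vectorizer turns the unrolled adds
// into a vector load plus a shuffle/add reduction. Integer addition is
// fully associative, so any reordering must preserve the result --
// getting a different value back indicates a miscompile.
int32_t hsum16(const int32_t *x) {
  int32_t sum = 0;
  for (int i = 0; i < 16; ++i)
    sum += x[i];
  return sum;
}
```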
On Mon, Feb 13, 2017 at 12:01 AM, Alexey Bataev via llvm-commits <
llvm-commits at lists.llvm.org> wrote:
> Author: abataev
> Date: Mon Feb 13 02:01:26 2017
> New Revision: 294934
>
> URL: http://llvm.org/viewvc/llvm-project?rev=294934&view=rev
> Log:
> [SLP] Fix for PR31690: Allow the use of extra values in horizontal
> reductions.
>
> Currently, LLVM supports vectorization of horizontal reduction
> instructions only when the initial value is set to 0. This patch adds
> support for vectorizing reductions with non-zero initial values, as
> well as reductions that involve extra arguments, like:
> ```
> float f(float x[], int a, int b) {
>   float p = a % b;
>   p += x[0] + 3;
>   for (int i = 1; i < 32; i++)
>     p += x[i];
>   return p;
> }
> ```
> The patch allows vectorization of this kind of horizontal reduction.
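Conceptually, the loads still become a single wide vector reduction, while
the non-zero start value and any extra arguments are added back onto the
scalar result afterwards. A hand-written scalar sketch of the transform's
semantics for f() above (not the actual IR the pass emits):

```cpp
// Equivalent to f() above once fast-math reassociation is allowed:
// the load chain is what gets vectorized; the extras are folded in last.
float f_after_slp(float x[], int a, int b) {
  float extra1 = (float)(a % b); // the non-zero initial value
  float extra2 = 3.0f;           // extra argument mixed into the chain
  float sum = 0.0f;
  for (int i = 0; i < 32; i++)   // becomes a wide vector load plus
    sum += x[i];                 // log2(n) shuffle/fadd steps
  return sum + extra1 + extra2;  // the "bin.extra" adds in the tests
}
```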
>
> Differential Revision: https://reviews.llvm.org/D29727
>
> Modified:
> llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
> llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=294934&r1=294933&r2=294934&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Mon Feb 13 02:01:26 2017
> @@ -330,6 +330,10 @@ public:
> /// \brief Vectorize the tree that starts with the elements in \p VL.
> /// Returns the vectorized root.
> Value *vectorizeTree();
> + /// Vectorize the tree but with the list of externally used values \p
> + /// ExternallyUsedValues. Values in this MapVector can be replaced by the
> + /// generated extractelement instructions.
> + Value *vectorizeTree(MapVector<Value *, DebugLoc> &ExternallyUsedValues);
>
> /// \returns the cost incurred by unwanted spills and fills, caused by
> /// holding live values over call sites.
> @@ -343,6 +347,13 @@ public:
> /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
> void buildTree(ArrayRef<Value *> Roots,
> ArrayRef<Value *> UserIgnoreLst = None);
> + /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
> + /// the purpose of scheduling and extraction in the \p UserIgnoreLst, taking
> + /// into account (and updating it, if required) the list of externally used
> + /// values stored in \p ExternallyUsedValues.
> + void buildTree(ArrayRef<Value *> Roots,
> + MapVector<Value *, DebugLoc> &ExternallyUsedValues,
> + ArrayRef<Value *> UserIgnoreLst = None);
>
> /// Clear the internal data structures that are created by 'buildTree'.
> void deleteTree() {
> @@ -576,7 +587,9 @@ private:
> SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
>
> /// A list of values that need to extracted out of the tree.
> - /// This list holds pairs of (Internal Scalar : External User).
> + /// This list holds pairs of (Internal Scalar : External User). External User
> + /// can be nullptr, which means that this Internal Scalar will be used later,
> + /// after vectorization.
> UserList ExternalUses;
>
> /// Values used only by @llvm.assume calls.
> @@ -940,6 +953,12 @@ private:
>
> void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
> ArrayRef<Value *> UserIgnoreLst) {
> + MapVector<Value *, DebugLoc> ExternallyUsedValues;
> + buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
> +}
> +void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
> + MapVector<Value *, DebugLoc> &ExternallyUsedValues,
> + ArrayRef<Value *> UserIgnoreLst) {
> deleteTree();
> UserIgnoreList = UserIgnoreLst;
> if (!allSameType(Roots))
> @@ -958,6 +977,14 @@ void BoUpSLP::buildTree(ArrayRef<Value *
> if (Entry->NeedToGather)
> continue;
>
> + // Check if the scalar is externally used as an extra arg.
> + auto ExtI = ExternallyUsedValues.find(Scalar);
> + if (ExtI != ExternallyUsedValues.end()) {
> + DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
> + Lane << " from " << *Scalar << ".\n");
> + ExternalUses.emplace_back(Scalar, nullptr, Lane);
> + continue;
> + }
> for (User *U : Scalar->users()) {
> DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
>
> @@ -2768,6 +2795,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
> }
>
> Value *BoUpSLP::vectorizeTree() {
> + MapVector<Value *, DebugLoc> ExternallyUsedValues;
> + return vectorizeTree(ExternallyUsedValues);
> +}
> +
> +Value *
> +BoUpSLP::vectorizeTree(MapVector<Value *, DebugLoc> &ExternallyUsedValues) {
>
> // All blocks must be scheduled before any instructions are inserted.
> for (auto &BSIter : BlocksSchedules) {
> @@ -2810,7 +2843,7 @@ Value *BoUpSLP::vectorizeTree() {
>
> // Skip users that we already RAUW. This happens when one instruction
> // has multiple uses of the same value.
> - if (!is_contained(Scalar->users(), User))
> + if (User && !is_contained(Scalar->users(), User))
> continue;
> assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
>
> @@ -2822,6 +2855,28 @@ Value *BoUpSLP::vectorizeTree() {
> assert(Vec && "Can't find vectorizable value");
>
> Value *Lane = Builder.getInt32(ExternalUse.Lane);
> + // If User == nullptr, the Scalar is used as extra arg. Generate
> + // ExtractElement instruction and update the record for this scalar in
> + // ExternallyUsedValues.
> + if (!User) {
> + assert(ExternallyUsedValues.count(Scalar) &&
> + "Scalar with nullptr as an external user must be registered in "
> + "ExternallyUsedValues map");
> + DebugLoc DL = ExternallyUsedValues[Scalar];
> + if (auto *VecI = dyn_cast<Instruction>(Vec)) {
> + Builder.SetInsertPoint(VecI->getParent(),
> + std::next(VecI->getIterator()));
> + } else {
> + Builder.SetInsertPoint(&F->getEntryBlock().front());
> + }
> + Value *Ex = Builder.CreateExtractElement(Vec, Lane);
> + Ex = extend(ScalarRoot, Ex, Scalar->getType());
> + CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
> + ExternallyUsedValues.erase(Scalar);
> + ExternallyUsedValues[Ex] = DL;
> + continue;
> + }
> +
> // Generate extracts for out-of-tree users.
> // Find the insertion point for the extractelement lane.
> if (auto *VecI = dyn_cast<Instruction>(Vec)) {
> @@ -4189,6 +4244,8 @@ namespace {
> class HorizontalReduction {
> SmallVector<Value *, 16> ReductionOps;
> SmallVector<Value *, 32> ReducedVals;
> + // Use a MapVector so that the output order is stable.
> + MapVector<Instruction *, Value *> ExtraArgs;
>
> BinaryOperator *ReductionRoot = nullptr;
> // After successful horizontal reduction vectorization attempt for PHI node
> @@ -4208,6 +4265,26 @@ class HorizontalReduction {
> /// splits the vector in halves and adds those halves.
> bool IsPairwiseReduction = false;
>
> + /// Checks if the ParentStackElem.first should be marked as a reduction
> + /// operation with an extra argument or as an extra argument itself.
> + void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
> + Value *ExtraArg) {
> + if (ExtraArgs.count(ParentStackElem.first)) {
> + ExtraArgs[ParentStackElem.first] = nullptr;
> + // We ran into something like:
> + // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
> + // The whole ParentStackElem.first should be considered as an extra value
> + // in this case.
> + // Do not perform analysis of the remaining operands of the
> + // ParentStackElem.first instruction; this whole instruction is an extra
> + // argument.
> + ParentStackElem.second = ParentStackElem.first->getNumOperands();
> + } else {
> + // We ran into something like:
> + // ParentStackElem.first += ... + ExtraArg + ...
> + ExtraArgs[ParentStackElem.first] = ExtraArg;
> + }
> + }
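To make the two branches above concrete, here are source shapes they
correspond to (hypothetical functions, written to match the comments rather
than taken from the patch):

```cpp
// Shapes markExtraArg distinguishes for a parent reduction node 't'.
float one_extra(float *x, float e) {
  // t = x[1] + e: first non-reduced operand seen for 't', so 'e' is
  // recorded as the extra argument of 't' (the else-branch above).
  return x[0] + (x[1] + e);
}

float two_extras(float *x, float e1, float e2) {
  // t = e1 + e2: a second non-reduced operand shows up for 't', so its
  // ExtraArgs entry is reset to nullptr and 't' as a whole becomes the
  // extra value; its operands are not analyzed further.
  return x[0] + x[1] + (e1 + e2);
}
```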
> +
> public:
> HorizontalReduction() = default;
>
> @@ -4260,8 +4337,23 @@ public:
> if (EdgeToVist == 2 || IsReducedValue) {
> if (IsReducedValue)
> ReducedVals.push_back(TreeN);
> - else
> - ReductionOps.push_back(TreeN);
> + else {
> + auto I = ExtraArgs.find(TreeN);
> + if (I != ExtraArgs.end() && !I->second) {
> + // Check if TreeN is an extra argument of its parent operation.
> + if (Stack.size() <= 1) {
> + // TreeN can't be an extra argument as it is a root reduction
> + // operation.
> + return false;
> + }
> + // Yes, TreeN is an extra argument, do not add it to a list of
> + // reduction operations.
> + // Stack[Stack.size() - 2] always points to the parent operation.
> + markExtraArg(Stack[Stack.size() - 2], TreeN);
> + ExtraArgs.erase(TreeN);
> + } else
> + ReductionOps.push_back(TreeN);
> + }
> // Retract.
> Stack.pop_back();
> continue;
> @@ -4278,30 +4370,42 @@ public:
> if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
> I->getOpcode() == ReductionOpcode)) {
> // Only handle trees in the current basic block.
> - if (I->getParent() != B->getParent())
> - return false;
> + if (I->getParent() != B->getParent()) {
> + // I is an extra argument for TreeN (its parent operation).
> + markExtraArg(Stack.back(), I);
> + continue;
> + }
>
> // Each tree node needs to have one user except for the ultimate
> // reduction.
> - if (!I->hasOneUse() && I != B)
> - return false;
> + if (!I->hasOneUse() && I != B) {
> + // I is an extra argument for TreeN (its parent operation).
> + markExtraArg(Stack.back(), I);
> + continue;
> + }
>
> if (I->getOpcode() == ReductionOpcode) {
> // We need to be able to reassociate the reduction operations.
> - if (!I->isAssociative())
> - return false;
> + if (!I->isAssociative()) {
> + // I is an extra argument for TreeN (its parent operation).
> + markExtraArg(Stack.back(), I);
> + continue;
> + }
> } else if (ReducedValueOpcode &&
> ReducedValueOpcode != I->getOpcode()) {
> // Make sure that the opcodes of the operations that we are going to
> // reduce match.
> - return false;
> + // I is an extra argument for TreeN (its parent operation).
> + markExtraArg(Stack.back(), I);
> + continue;
> } else if (!ReducedValueOpcode)
> ReducedValueOpcode = I->getOpcode();
>
> Stack.push_back(std::make_pair(I, 0));
> continue;
> }
> - return false;
> + // NextV is an extra argument for TreeN (its parent operation).
> + markExtraArg(Stack.back(), NextV);
> }
> }
> return true;
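The practical effect of the hunk above: operand shapes that used to abort the
whole match (defined in another block, multiple uses, a non-associative
operation, or a mismatched opcode) are now just peeled off as extra
arguments, so reductions like this hypothetical one still vectorize:

```cpp
// The a*b term has a different opcode than the fadd reduction chain, so
// it is treated as an extra argument instead of blocking the match.
float reduce_with_mul(float *x, float a, float b) {
  return x[0] + x[1] + x[2] + x[3] + x[4] + x[5] + x[6] + x[7] + a * b;
}
```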
> @@ -4329,12 +4433,15 @@ public:
> Builder.setFastMathFlags(Unsafe);
> unsigned i = 0;
>
> + MapVector<Value *, DebugLoc> ExternallyUsedValues;
> + for (auto &Pair : ExtraArgs)
> + ExternallyUsedValues[Pair.second] = Pair.first->getDebugLoc();
> while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
> auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
> - V.buildTree(VL, ReductionOps);
> + V.buildTree(VL, ExternallyUsedValues, ReductionOps);
> if (V.shouldReorder()) {
> SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
> - V.buildTree(Reversed, ReductionOps);
> + V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
> }
> if (V.isTreeTinyAndNotFullyVectorizable())
> break;
> @@ -4352,7 +4459,7 @@ public:
>
> // Vectorize a tree.
> DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
> - Value *VectorizedRoot = V.vectorizeTree();
> + Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
>
> // Emit a reduction.
> Value *ReducedSubTree =
> @@ -4370,10 +4477,15 @@ public:
> if (VectorizedTree) {
> // Finish the reduction.
> for (; i < NumReducedVals; ++i) {
> - Builder.SetCurrentDebugLocation(
> - cast<Instruction>(ReducedVals[i])->getDebugLoc());
> + auto *I = cast<Instruction>(ReducedVals[i]);
> + Builder.SetCurrentDebugLocation(I->getDebugLoc());
> + VectorizedTree =
> + Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
> + }
> + for (auto &Pair : ExternallyUsedValues) {
> + Builder.SetCurrentDebugLocation(Pair.second);
> VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
> - ReducedVals[i]);
> + Pair.first, "bin.extra");
> }
> // Update users.
> if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
>
> Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll?rev=294934&r1=294933&r2=294934&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll (original)
> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll Mon Feb 13 02:01:26 2017
> @@ -97,78 +97,62 @@ define float @bazz() {
> ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
> ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
> ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
> -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
> -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
> -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
> -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
> -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
> -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
> -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
> -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
> -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
> -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
> +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
> +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
> +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
> +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
> +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
> +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
> +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
> ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
> ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
> ; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
> -; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4) to <2 x float>*), align 16
> -; CHECK-NEXT: [[TMP12:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4) to <2 x float>*), align 16
> -; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[TMP12]], [[TMP11]]
> -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
> -; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP14]], [[ADD7]]
> -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
> -; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP15]], [[ADD19]]
> -; CHECK-NEXT: [[TMP16:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
> -; CHECK-NEXT: [[TMP17:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
> -; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <2 x float> [[TMP17]], [[TMP16]]
> -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
> -; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP19]], [[ADD19_1]]
> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
> -; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP20]], [[ADD19_2]]
> -; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4
> -; CHECK-NEXT: ret float [[ADD19_3]]
> +; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float undef, [[ADD7]]
> +; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]]
> +; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]]
> +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]]
> +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
> +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
> +; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
> +; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]]
> +; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]]
> +; CHECK-NEXT: store float [[BIN_EXTRA5]], float* @res, align 4
> +; CHECK-NEXT: ret float [[BIN_EXTRA5]]
> ;
> ; THRESHOLD-LABEL: @bazz(
> ; THRESHOLD-NEXT: entry:
> ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
> ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
> ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
> -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
> -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
> -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
> -; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
> -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
> -; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
> -; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
> -; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
> -; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
> -; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
> -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
> -; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
> -; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
> -; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
> +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
> +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
> +; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
> +; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
> +; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
> +; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
> +; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
> ; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
> ; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
> ; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
> -; THRESHOLD-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4) to <2 x float>*), align 16
> -; THRESHOLD-NEXT: [[TMP12:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4) to <2 x float>*), align 16
> -; THRESHOLD-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[TMP12]], [[TMP11]]
> -; THRESHOLD-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
> -; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP14]], [[ADD7]]
> -; THRESHOLD-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
> -; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP15]], [[ADD19]]
> -; THRESHOLD-NEXT: [[TMP16:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
> -; THRESHOLD-NEXT: [[TMP17:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
> -; THRESHOLD-NEXT: [[TMP18:%.*]] = fmul fast <2 x float> [[TMP17]], [[TMP16]]
> -; THRESHOLD-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
> -; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP19]], [[ADD19_1]]
> -; THRESHOLD-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
> -; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP20]], [[ADD19_2]]
> -; THRESHOLD-NEXT: store float [[ADD19_3]], float* @res, align 4
> -; THRESHOLD-NEXT: ret float [[ADD19_3]]
> +; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float undef, [[ADD7]]
> +; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]]
> +; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]]
> +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]]
> +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
> +; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
> +; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
> +; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
> +; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]]
> +; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]]
> +; THRESHOLD-NEXT: store float [[BIN_EXTRA5]], float* @res, align 4
> +; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]]
> ;
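Both FileCheck prefixes now expect the same shape: one <8 x float> fmul, a
log2-width shuffle/fadd reduction, and the two scalar extra arguments
([[CONV]] and [[CONV6]]) folded back in as the trailing "bin.extra" adds. In
scalar terms, what @bazz computes is (a hand-written sketch, using the
@n/@arr/@arr1 globals the test declares):

```cpp
// Scalar semantics of @bazz: the products are reduced as a vector, then
// conv and conv6 are added back as the two bin.extra instructions.
float bazz_semantics(const float *arr, const float *arr1, int n) {
  float conv = (float)(n * 3);    // [[CONV]]  -> bin.extra
  float conv6 = (float)(n << 2);  // [[CONV6]] -> bin.extra5
  float sum = 0.0f;
  for (int i = 0; i < 8; ++i)     // the vectorized fmul + reduction
    sum += arr1[i] * arr[i];
  return sum + conv + conv6;
}
```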
> entry:
> %0 = load i32, i32* @n, align 4
> @@ -806,203 +790,167 @@ define float @f1(float* nocapture readon
> ; CHECK-NEXT: entry:
> ; CHECK-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
> ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float
> -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
> -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], [[CONV]]
> -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
> -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
> -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[ADD]]
> +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
> ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
> -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
> -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]]
> ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
> -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
> -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]]
> ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
> -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4
> -; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float [[TMP4]], [[ADD_3]]
> ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
> -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4
> -; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float [[TMP5]], [[ADD_4]]
> ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
> -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_6]], align 4
> -; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float [[TMP6]], [[ADD_5]]
> ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
> -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX_7]], align 4
> -; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float [[TMP7]], [[ADD_6]]
> ; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
> -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX_8]], align 4
> -; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float [[TMP8]], [[ADD_7]]
> ; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
> -; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[ARRAYIDX_9]], align 4
> -; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float [[TMP9]], [[ADD_8]]
> ; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
> -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX_10]], align 4
> -; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float [[TMP10]], [[ADD_9]]
> ; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
> -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX_11]], align 4
> -; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float [[TMP11]], [[ADD_10]]
> ; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
> -; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX_12]], align 4
> -; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float [[TMP12]], [[ADD_11]]
> ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
> -; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX_13]], align 4
> -; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float [[TMP13]], [[ADD_12]]
> ; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
> -; CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX_14]], align 4
> -; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float [[TMP14]], [[ADD_13]]
> ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
> -; CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX_15]], align 4
> -; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float [[TMP15]], [[ADD_14]]
> ; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
> -; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[ARRAYIDX_16]], align 4
> -; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float [[TMP16]], [[ADD_15]]
> ; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
> -; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX_17]], align 4
> -; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float [[TMP17]], [[ADD_16]]
> ; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
> -; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX_18]], align 4
> -; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float [[TMP18]], [[ADD_17]]
> ; CHECK-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
> -; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX_19]], align 4
> -; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float [[TMP19]], [[ADD_18]]
> ; CHECK-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
> -; CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[ARRAYIDX_20]], align 4
> -; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float [[TMP20]], [[ADD_19]]
> ; CHECK-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
> -; CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDX_21]], align 4
> -; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float [[TMP21]], [[ADD_20]]
> ; CHECK-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
> -; CHECK-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDX_22]], align 4
> -; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float [[TMP22]], [[ADD_21]]
> ; CHECK-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
> -; CHECK-NEXT: [[TMP23:%.*]] = load float, float* [[ARRAYIDX_23]], align 4
> -; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float [[TMP23]], [[ADD_22]]
> ; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
> -; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX_24]], align 4
> -; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float [[TMP24]], [[ADD_23]]
> ; CHECK-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
> -; CHECK-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX_25]], align 4
> -; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float [[TMP25]], [[ADD_24]]
> ; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
> -; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDX_26]], align 4
> -; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float [[TMP26]], [[ADD_25]]
> ; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
> -; CHECK-NEXT: [[TMP27:%.*]] = load float, float* [[ARRAYIDX_27]], align 4
> -; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float [[TMP27]], [[ADD_26]]
> ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
> -; CHECK-NEXT: [[TMP28:%.*]] = load float, float* [[ARRAYIDX_28]], align 4
> -; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float [[TMP28]], [[ADD_27]]
> ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
> -; CHECK-NEXT: [[TMP29:%.*]] = load float, float* [[ARRAYIDX_29]], align 4
> -; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float [[TMP29]], [[ADD_28]]
> ; CHECK-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
> -; CHECK-NEXT: [[TMP30:%.*]] = load float, float* [[ARRAYIDX_30]], align 4
> -; CHECK-NEXT: [[ADD_30:%.*]] = fadd fast float [[TMP30]], [[ADD_29]]
> ; CHECK-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
> -; CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[ARRAYIDX_31]], align 4
> -; CHECK-NEXT: [[ADD_31:%.*]] = fadd fast float [[TMP31]], [[ADD_30]]
> -; CHECK-NEXT: ret float [[ADD_31]]
> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
> +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
> +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
> +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
> +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
> +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
> +; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
> +; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
> +; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
> +; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
> +; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
> +; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
> +; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
> +; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
> +; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
> +; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
> +; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
> +; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
> +; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
> +; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
> +; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
> +; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
> +; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
> +; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
> +; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
> +; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
> +; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
> +; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
> +; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
> +; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
> +; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
> +; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
> +; CHECK-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
> +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]]
> +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
> +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
> +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
> +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
> +; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
> +; CHECK-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
> +; CHECK-NEXT: ret float [[BIN_EXTRA]]
> ;
> ; THRESHOLD-LABEL: @f1(
> ; THRESHOLD-NEXT: entry:
> ; THRESHOLD-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
> ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float
> -; THRESHOLD-NEXT: [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
> -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], [[CONV]]
> -; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
> -; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
> -; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[ADD]]
> +; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
> ; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
> -; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
> -; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
> -; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
> -; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
> -; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4
> -; THRESHOLD-NEXT: [[ADD_4:%.*]] = fadd fast float [[TMP4]], [[ADD_3]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
> -; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4
> -; THRESHOLD-NEXT: [[ADD_5:%.*]] = fadd fast float [[TMP5]], [[ADD_4]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
> -; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_6]], align 4
> -; THRESHOLD-NEXT: [[ADD_6:%.*]] = fadd fast float [[TMP6]], [[ADD_5]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
> -; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX_7]], align 4
> -; THRESHOLD-NEXT: [[ADD_7:%.*]] = fadd fast float [[TMP7]], [[ADD_6]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
> -; THRESHOLD-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX_8]], align 4
> -; THRESHOLD-NEXT: [[ADD_8:%.*]] = fadd fast float [[TMP8]], [[ADD_7]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
> -; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, float* [[ARRAYIDX_9]], align 4
> -; THRESHOLD-NEXT: [[ADD_9:%.*]] = fadd fast float [[TMP9]], [[ADD_8]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
> -; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX_10]], align 4
> -; THRESHOLD-NEXT: [[ADD_10:%.*]] = fadd fast float [[TMP10]], [[ADD_9]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
> -; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX_11]], align 4
> -; THRESHOLD-NEXT: [[ADD_11:%.*]] = fadd fast float [[TMP11]], [[ADD_10]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
> -; THRESHOLD-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX_12]], align 4
> -; THRESHOLD-NEXT: [[ADD_12:%.*]] = fadd fast float [[TMP12]], [[ADD_11]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
> -; THRESHOLD-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX_13]], align 4
> -; THRESHOLD-NEXT: [[ADD_13:%.*]] = fadd fast float [[TMP13]], [[ADD_12]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
> -; THRESHOLD-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX_14]], align 4
> -; THRESHOLD-NEXT: [[ADD_14:%.*]] = fadd fast float [[TMP14]], [[ADD_13]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
> -; THRESHOLD-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX_15]], align 4
> -; THRESHOLD-NEXT: [[ADD_15:%.*]] = fadd fast float [[TMP15]], [[ADD_14]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
> -; THRESHOLD-NEXT: [[TMP16:%.*]] = load float, float* [[ARRAYIDX_16]], align 4
> -; THRESHOLD-NEXT: [[ADD_16:%.*]] = fadd fast float [[TMP16]], [[ADD_15]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
> -; THRESHOLD-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX_17]], align 4
> -; THRESHOLD-NEXT: [[ADD_17:%.*]] = fadd fast float [[TMP17]], [[ADD_16]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
> -; THRESHOLD-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX_18]], align 4
> -; THRESHOLD-NEXT: [[ADD_18:%.*]] = fadd fast float [[TMP18]], [[ADD_17]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
> -; THRESHOLD-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX_19]], align 4
> -; THRESHOLD-NEXT: [[ADD_19:%.*]] = fadd fast float [[TMP19]], [[ADD_18]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
> -; THRESHOLD-NEXT: [[TMP20:%.*]] = load float, float* [[ARRAYIDX_20]], align 4
> -; THRESHOLD-NEXT: [[ADD_20:%.*]] = fadd fast float [[TMP20]], [[ADD_19]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
> -; THRESHOLD-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDX_21]], align 4
> -; THRESHOLD-NEXT: [[ADD_21:%.*]] = fadd fast float [[TMP21]], [[ADD_20]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
> -; THRESHOLD-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDX_22]], align 4
> -; THRESHOLD-NEXT: [[ADD_22:%.*]] = fadd fast float [[TMP22]], [[ADD_21]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
> -; THRESHOLD-NEXT: [[TMP23:%.*]] = load float, float* [[ARRAYIDX_23]], align 4
> -; THRESHOLD-NEXT: [[ADD_23:%.*]] = fadd fast float [[TMP23]], [[ADD_22]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
> -; THRESHOLD-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX_24]], align 4
> -; THRESHOLD-NEXT: [[ADD_24:%.*]] = fadd fast float [[TMP24]], [[ADD_23]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
> -; THRESHOLD-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX_25]], align 4
> -; THRESHOLD-NEXT: [[ADD_25:%.*]] = fadd fast float [[TMP25]], [[ADD_24]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
> -; THRESHOLD-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDX_26]], align 4
> -; THRESHOLD-NEXT: [[ADD_26:%.*]] = fadd fast float [[TMP26]], [[ADD_25]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
> -; THRESHOLD-NEXT: [[TMP27:%.*]] = load float, float* [[ARRAYIDX_27]], align 4
> -; THRESHOLD-NEXT: [[ADD_27:%.*]] = fadd fast float [[TMP27]], [[ADD_26]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
> -; THRESHOLD-NEXT: [[TMP28:%.*]] = load float, float* [[ARRAYIDX_28]], align 4
> -; THRESHOLD-NEXT: [[ADD_28:%.*]] = fadd fast float [[TMP28]], [[ADD_27]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
> -; THRESHOLD-NEXT: [[TMP29:%.*]] = load float, float* [[ARRAYIDX_29]], align 4
> -; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float [[TMP29]], [[ADD_28]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
> -; THRESHOLD-NEXT: [[TMP30:%.*]] = load float, float* [[ARRAYIDX_30]], align 4
> -; THRESHOLD-NEXT: [[ADD_30:%.*]] = fadd fast float [[TMP30]], [[ADD_29]]
> ; THRESHOLD-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
> -; THRESHOLD-NEXT: [[TMP31:%.*]] = load float, float* [[ARRAYIDX_31]], align 4
> -; THRESHOLD-NEXT: [[ADD_31:%.*]] = fadd fast float [[TMP31]], [[ADD_30]]
> -; THRESHOLD-NEXT: ret float [[ADD_31]]
> +; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
> +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
> +; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
> +; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
> +; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
> +; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
> +; THRESHOLD-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
> +; THRESHOLD-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
> +; THRESHOLD-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
> +; THRESHOLD-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
> +; THRESHOLD-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
> +; THRESHOLD-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
> +; THRESHOLD-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
> +; THRESHOLD-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
> +; THRESHOLD-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
> +; THRESHOLD-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
> +; THRESHOLD-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
> +; THRESHOLD-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
> +; THRESHOLD-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
> +; THRESHOLD-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
> +; THRESHOLD-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
> +; THRESHOLD-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
> +; THRESHOLD-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
> +; THRESHOLD-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
> +; THRESHOLD-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
> +; THRESHOLD-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
> +; THRESHOLD-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
> +; THRESHOLD-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
> +; THRESHOLD-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
> +; THRESHOLD-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
> +; THRESHOLD-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
> +; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
> +; THRESHOLD-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
> +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]]
> +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
> +; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
> +; THRESHOLD-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
> +; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
> +; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
> +; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
> +; THRESHOLD-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
> +; THRESHOLD-NEXT: ret float [[BIN_EXTRA]]
> ;
> entry:
> %rem = srem i32 %a, %b
> @@ -1396,63 +1344,69 @@ define float @extra_args(float* nocaptur
> ; CHECK-NEXT: entry:
> ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
> ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
> -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
> ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
> -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
> -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
> -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]], align 4
> -; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
> -; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
> +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
> ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
> -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
> -; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD5]]
> ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
> -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
> -; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
> ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
> -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
> -; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
> ; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
> -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]], align 4
> -; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD4_3]]
> ; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
> -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]], align 4
> -; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
> ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
> -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]], align 4
> -; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
> -; CHECK-NEXT: ret float [[ADD4_6]]
> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
> +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
> +; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
> +; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
> +; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
> +; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
> +; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
> +; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
> +; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
> +; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
> +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
> +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
> +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
> +; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
> +; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
> +; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
> +; CHECK-NEXT: ret float [[BIN_EXTRA5]]
> ;
> ; THRESHOLD-LABEL: @extra_args(
> ; THRESHOLD-NEXT: entry:
> ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
> ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
> -; THRESHOLD-NEXT: [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
> ; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
> -; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
> -; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
> -; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]], align 4
> -; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
> -; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
> +; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
> ; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
> -; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
> -; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD5]]
> ; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
> -; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
> -; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
> ; THRESHOLD-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
> -; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
> -; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
> ; THRESHOLD-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
> -; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]], align 4
> -; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD4_3]]
> ; THRESHOLD-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
> -; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]], align 4
> -; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
> ; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
> -; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]], align 4
> -; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
> -; THRESHOLD-NEXT: ret float [[ADD4_6]]
> +; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
> +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
> +; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
> +; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
> +; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
> +; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
> +; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
> +; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
> +; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
> +; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
> +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
> +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
> +; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
> +; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
> +; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
> +; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
> +; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
> +; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]]
> ;
> entry:
> %mul = mul nsw i32 %b, %a
> @@ -1490,67 +1444,73 @@ define float @extra_args_no_replace(floa
> ; CHECK-NEXT: entry:
> ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
> ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
> -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
> ; CHECK-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
> ; CHECK-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
> ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
> -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
> -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float*
> [[X]], i64 1
> -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]], align 4
> -; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
> +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float*
> [[X:%.*]], i64 1
> ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 2
> -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]], align
> 4
> -; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD4]]
> ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 3
> -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]], align
> 4
> -; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
> ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 4
> -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]], align
> 4
> -; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
> -; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
> ; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 5
> -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]], align
> 4
> -; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD5]]
> ; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 6
> -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]], align
> 4
> -; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
> ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 7
> -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]], align
> 4
> -; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
> -; CHECK-NEXT: ret float [[ADD4_6]]
> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
> +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]],
> align 4
> +; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
> +; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
> +; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]]
> +; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
> +; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
> +; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
> +; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]]
> +; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
> +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]],
> <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32
> undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]],
> [[RDX_SHUF]]
> +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float>
> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32
> undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]],
> [[RDX_SHUF1]]
> +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float>
> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef,
> i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]],
> [[RDX_SHUF3]]
> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]],
> i32 0
> +; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
> +; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]],
> [[CONV]]
> +; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
> +; CHECK-NEXT: ret float [[BIN_EXTRA5]]
> ;
> ; THRESHOLD-LABEL: @extra_args_no_replace(
> ; THRESHOLD-NEXT: entry:
> ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
> ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
> -; THRESHOLD-NEXT: [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
> ; THRESHOLD-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
> ; THRESHOLD-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]],
> 3.000000e+00
> ; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
> -; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
> -; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 1
> -; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]],
> align 4
> -; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
> +; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float,
> float* [[X:%.*]], i64 1
> ; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 2
> -; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]],
> align 4
> -; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD4]]
> ; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 3
> -; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]],
> align 4
> -; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
> ; THRESHOLD-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 4
> -; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]],
> align 4
> -; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
> -; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
> ; THRESHOLD-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 5
> -; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]],
> align 4
> -; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD5]]
> ; THRESHOLD-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 6
> -; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]],
> align 4
> -; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
> ; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float,
> float* [[X]], i64 7
> -; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]],
> align 4
> -; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
> -; THRESHOLD-NEXT: ret float [[ADD4_6]]
> +; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
> +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>*
> [[TMP0]], align 4
> +; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
> +; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
> +; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]]
> +; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
> +; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
> +; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
> +; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]]
> +; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
> +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float>
> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32
> undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]],
> [[RDX_SHUF]]
> +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float>
> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32
> undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float>
> [[BIN_RDX]], [[RDX_SHUF1]]
> +; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float>
> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef,
> i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float>
> [[BIN_RDX2]], [[RDX_SHUF3]]
> +; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float>
> [[BIN_RDX4]], i32 0
> +; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
> +; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]],
> [[CONV]]
> +; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
> +; THRESHOLD-NEXT: ret float [[BIN_EXTRA5]]
> ;
> entry:
> %mul = mul nsw i32 %b, %a
> @@ -1588,45 +1548,59 @@ define float @extra_args_no_replace(floa
> define i32 @wobble(i32 %arg, i32 %bar) {
> ; CHECK-LABEL: @wobble(
> ; CHECK-NEXT: bb:
> -; CHECK-NEXT: [[X1:%.*]] = xor i32 [[ARG:%.*]], [[BAR:%.*]]
> -; CHECK-NEXT: [[I1:%.*]] = icmp eq i32 [[X1]], 0
> -; CHECK-NEXT: [[S1:%.*]] = sext i1 [[I1]] to i32
> -; CHECK-NEXT: [[X2:%.*]] = xor i32 [[ARG]], [[BAR]]
> -; CHECK-NEXT: [[I2:%.*]] = icmp eq i32 [[X2]], 0
> -; CHECK-NEXT: [[S2:%.*]] = sext i1 [[I2]] to i32
> -; CHECK-NEXT: [[X3:%.*]] = xor i32 [[ARG]], [[BAR]]
> -; CHECK-NEXT: [[I3:%.*]] = icmp eq i32 [[X3]], 0
> -; CHECK-NEXT: [[S3:%.*]] = sext i1 [[I3]] to i32
> -; CHECK-NEXT: [[X4:%.*]] = xor i32 [[ARG]], [[BAR]]
> -; CHECK-NEXT: [[I4:%.*]] = icmp eq i32 [[X4]], 0
> -; CHECK-NEXT: [[S4:%.*]] = sext i1 [[I4]] to i32
> -; CHECK-NEXT: [[R1:%.*]] = add i32 [[ARG]], [[S1]]
> -; CHECK-NEXT: [[R2:%.*]] = add i32 [[R1]], [[S2]]
> -; CHECK-NEXT: [[R3:%.*]] = add i32 [[R2]], [[S3]]
> -; CHECK-NEXT: [[R4:%.*]] = add i32 [[R3]], [[S4]]
> -; CHECK-NEXT: [[R5:%.*]] = add i32 [[R4]], [[X4]]
> -; CHECK-NEXT: ret i32 [[R5]]
> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32
> [[ARG:%.*]], i32 0
> +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32
> [[ARG]], i32 1
> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32
> [[ARG]], i32 2
> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32
> [[ARG]], i32 3
> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32
> [[BAR:%.*]], i32 0
> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32
> [[BAR]], i32 1
> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32
> [[BAR]], i32 2
> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32
> [[BAR]], i32 3
> +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]]
> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
> +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]],
> zeroinitializer
> +; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
> +; CHECK-NEXT: [[R1:%.*]] = add i32 [[ARG]], undef
> +; CHECK-NEXT: [[R2:%.*]] = add i32 [[R1]], undef
> +; CHECK-NEXT: [[R3:%.*]] = add i32 [[R2]], undef
> +; CHECK-NEXT: [[R4:%.*]] = add i32 [[R3]], undef
> +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4
> x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]]
> +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]],
> <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]],
> [[RDX_SHUF1]]
> +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]],
> i32 0
> +; CHECK-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP12]], [[ARG]]
> +; CHECK-NEXT: [[BIN_EXTRA3:%.*]] = add i32 [[BIN_EXTRA]], [[TMP9]]
> +; CHECK-NEXT: [[R5:%.*]] = add i32 [[R4]], undef
> +; CHECK-NEXT: ret i32 [[BIN_EXTRA3]]
> ;
> ; THRESHOLD-LABEL: @wobble(
> ; THRESHOLD-NEXT: bb:
> -; THRESHOLD-NEXT: [[X1:%.*]] = xor i32 [[ARG:%.*]], [[BAR:%.*]]
> -; THRESHOLD-NEXT: [[I1:%.*]] = icmp eq i32 [[X1]], 0
> -; THRESHOLD-NEXT: [[S1:%.*]] = sext i1 [[I1]] to i32
> -; THRESHOLD-NEXT: [[X2:%.*]] = xor i32 [[ARG]], [[BAR]]
> -; THRESHOLD-NEXT: [[I2:%.*]] = icmp eq i32 [[X2]], 0
> -; THRESHOLD-NEXT: [[S2:%.*]] = sext i1 [[I2]] to i32
> -; THRESHOLD-NEXT: [[X3:%.*]] = xor i32 [[ARG]], [[BAR]]
> -; THRESHOLD-NEXT: [[I3:%.*]] = icmp eq i32 [[X3]], 0
> -; THRESHOLD-NEXT: [[S3:%.*]] = sext i1 [[I3]] to i32
> -; THRESHOLD-NEXT: [[X4:%.*]] = xor i32 [[ARG]], [[BAR]]
> -; THRESHOLD-NEXT: [[I4:%.*]] = icmp eq i32 [[X4]], 0
> -; THRESHOLD-NEXT: [[S4:%.*]] = sext i1 [[I4]] to i32
> -; THRESHOLD-NEXT: [[R1:%.*]] = add i32 [[ARG]], [[S1]]
> -; THRESHOLD-NEXT: [[R2:%.*]] = add i32 [[R1]], [[S2]]
> -; THRESHOLD-NEXT: [[R3:%.*]] = add i32 [[R2]], [[S3]]
> -; THRESHOLD-NEXT: [[R4:%.*]] = add i32 [[R3]], [[S4]]
> -; THRESHOLD-NEXT: [[R5:%.*]] = add i32 [[R4]], [[X4]]
> -; THRESHOLD-NEXT: ret i32 [[R5]]
> +; THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32
> [[ARG:%.*]], i32 0
> +; THRESHOLD-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32
> [[ARG]], i32 1
> +; THRESHOLD-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32
> [[ARG]], i32 2
> +; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32
> [[ARG]], i32 3
> +; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32
> [[BAR:%.*]], i32 0
> +; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32
> [[BAR]], i32 1
> +; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32
> [[BAR]], i32 2
> +; THRESHOLD-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32
> [[BAR]], i32 3
> +; THRESHOLD-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]]
> +; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]],
> i32 3
> +; THRESHOLD-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]],
> zeroinitializer
> +; THRESHOLD-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
> +; THRESHOLD-NEXT: [[R1:%.*]] = add i32 [[ARG]], undef
> +; THRESHOLD-NEXT: [[R2:%.*]] = add i32 [[R1]], undef
> +; THRESHOLD-NEXT: [[R3:%.*]] = add i32 [[R2]], undef
> +; THRESHOLD-NEXT: [[R4:%.*]] = add i32 [[R3]], undef
> +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32>
> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]],
> [[RDX_SHUF]]
> +; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32>
> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32
> undef>
> +; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]],
> [[RDX_SHUF1]]
> +; THRESHOLD-NEXT: [[TMP12:%.*]] = extractelement <4 x i32>
> [[BIN_RDX2]], i32 0
> +; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP12]], [[ARG]]
> +; THRESHOLD-NEXT: [[BIN_EXTRA3:%.*]] = add i32 [[BIN_EXTRA]], [[TMP9]]
> +; THRESHOLD-NEXT: [[R5:%.*]] = add i32 [[R4]], undef
> +; THRESHOLD-NEXT: ret i32 [[BIN_EXTRA3]]
> ;
> bb:
> %x1 = xor i32 %arg, %bar
>
>
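
For readers decoding the new CHECK lines above: after this patch the vectorized
form loads the whole array as one vector, reduces it with a log2(N)-step
shufflevector/add ladder (the RDX_SHUF/BIN_RDX pairs), extracts lane 0, and
only then folds the "extra" scalar arguments in via the BIN_EXTRA adds. The
old scalar chain that now feeds on undef is dead (its result is no longer
returned) and is presumably left for later cleanup passes to remove. Below is
a minimal, hand-written sketch of that shape, not taken from the patch or the
test file; the function name and the %extra parameter are made up for
illustration:

```
define i32 @reduce_add_v4i32(<4 x i32> %v, i32 %extra) {
  ; fold the upper half onto the lower half: lanes <2,3> added to lanes <0,1>
  %rdx.shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %v, %rdx.shuf
  ; fold lane 1 onto lane 0
  %rdx.shuf1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <4 x i32> %bin.rdx, %rdx.shuf1
  ; lane 0 now holds the full horizontal sum
  %scalar = extractelement <4 x i32> %bin.rdx2, i32 0
  ; the extra (non-reduced) argument is added once, after the reduction,
  ; mirroring the BIN_EXTRA instructions in the CHECK lines above
  %res = add i32 %scalar, %extra
  ret i32 %res
}
```

The <8 x float> cases in @extra_args and @extra_args_no_replace have the same
shape, just with one more shuffle step and fast-math fadds instead of integer
adds, and with two extra values ([[ADD]] and [[CONV]]) folded in at the end.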