[llvm] c2441b6 - [LoopVectorize] Add vector reduction support for fmuladd intrinsic
Rosie Sumpter via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 24 00:59:56 PST 2021
Author: Rosie Sumpter
Date: 2021-11-24T08:50:04Z
New Revision: c2441b6b89bfe52a16f6c5ed5a0a49c9a02daf2a
URL: https://github.com/llvm/llvm-project/commit/c2441b6b89bfe52a16f6c5ed5a0a49c9a02daf2a
DIFF: https://github.com/llvm/llvm-project/commit/c2441b6b89bfe52a16f6c5ed5a0a49c9a02daf2a.diff
LOG: [LoopVectorize] Add vector reduction support for fmuladd intrinsic
Enables LoopVectorize to handle reduction patterns involving the
llvm.fmuladd intrinsic.
Differential Revision: https://reviews.llvm.org/D111555
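For context, here is a minimal C++ sketch (not part of the patch; the function name is illustrative) of the kind of scalar reduction loop this enables. When the front end contracts the multiply and add in the loop body into a call to the llvm.fmuladd intrinsic (e.g. clang with -ffp-contract=on), the loop now matches a recognized reduction pattern:

float dot(const float *a, const float *b, long n) {
  // Reduction phi: sum accumulates a[i] * b[i] each iteration. With FP
  // contraction the loop body becomes llvm.fmuladd(a[i], b[i], sum), the
  // pattern exercised by the IR tests added below.
  float sum = 0.0f;
  for (long i = 0; i < n; ++i)
    sum += a[i] * b[i];
  return sum;
}

For strict (in-order) FP reductions the widened fmuladd is split into an fmul plus an ordered fadd reduction, while fast-math reductions keep the widened llvm.fmuladd calls and reduce once in the middle block; the CHECK-ORDERED and CHECK-UNORDERED lines in the new tests cover both cases.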
Added:
Modified:
llvm/include/llvm/Analysis/IVDescriptors.h
llvm/lib/Analysis/IVDescriptors.cpp
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/lib/Transforms/Utils/LoopUtils.cpp
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index c26dbc4579498..ea4c0312e073d 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Casting.h"
@@ -50,6 +51,7 @@ enum class RecurKind {
FMul, ///< Product of floats.
FMin, ///< FP min implemented in terms of select(cmp()).
FMax, ///< FP max implemented in terms of select(cmp()).
+ FMulAdd, ///< Fused multiply-add of floats (a * b + c).
SelectICmp, ///< Integer select(icmp(),x,y) where one of (x,y) is loop
///< invariant
SelectFCmp ///< Integer select(fcmp(),x,y) where one of (x,y) is loop
@@ -260,6 +262,12 @@ class RecurrenceDescriptor {
SmallVector<Instruction *, 4> getReductionOpChain(PHINode *Phi,
Loop *L) const;
+ /// Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
+ static bool isFMulAddIntrinsic(Instruction *I) {
+ return isa<IntrinsicInst>(I) &&
+ cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::fmuladd;
+ }
+
private:
// The starting value of the recurrence.
// It does not have to be zero!
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index c4b7239b43abc..cfe910df4e919 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -81,6 +81,7 @@ bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurKind Kind) {
case RecurKind::Mul:
case RecurKind::FAdd:
case RecurKind::FMul:
+ case RecurKind::FMulAdd:
return true;
}
return false;
@@ -194,21 +195,28 @@ static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
// vectorizing floating point operations without unsafe math.
static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst,
Instruction *Exit, PHINode *Phi) {
- // Currently only FAdd is supported
- if (Kind != RecurKind::FAdd)
+ // Currently only FAdd and FMulAdd are supported.
+ if (Kind != RecurKind::FAdd && Kind != RecurKind::FMulAdd)
return false;
- // Ensure the exit instruction is an FAdd, and that it only has one user
- // other than the reduction PHI
- if (Exit->getOpcode() != Instruction::FAdd || Exit->hasNUsesOrMore(3) ||
- Exit != ExactFPMathInst)
+ if (Kind == RecurKind::FAdd && Exit->getOpcode() != Instruction::FAdd)
+ return false;
+
+ if (Kind == RecurKind::FMulAdd &&
+ !RecurrenceDescriptor::isFMulAddIntrinsic(Exit))
+ return false;
+
+ // Ensure the exit instruction has only one user other than the reduction PHI
+ if (Exit != ExactFPMathInst || Exit->hasNUsesOrMore(3))
return false;
// The only pattern accepted is the one in which the reduction PHI
// is used as one of the operands of the exit instruction
- auto *LHS = Exit->getOperand(0);
- auto *RHS = Exit->getOperand(1);
- if (LHS != Phi && RHS != Phi)
+ auto *Op0 = Exit->getOperand(0);
+ auto *Op1 = Exit->getOperand(1);
+ if (Kind == RecurKind::FAdd && Op0 != Phi && Op1 != Phi)
+ return false;
+ if (Kind == RecurKind::FMulAdd && Exit->getOperand(2) != Phi)
return false;
LLVM_DEBUG(dbgs() << "LV: Found an ordered reduction: Phi: " << *Phi
@@ -389,6 +397,12 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
for (User *U : Cur->users()) {
Instruction *UI = cast<Instruction>(U);
+ // If the user is a call to llvm.fmuladd then the reduction value (Cur) can
+ // only be used as the final (addend) operand.
+ if (isFMulAddIntrinsic(UI))
+ if (Cur == UI->getOperand(0) || Cur == UI->getOperand(1))
+ return false;
+
// Check if we found the exit user.
BasicBlock *Parent = UI->getParent();
if (!TheLoop->contains(Parent)) {
@@ -710,6 +724,9 @@ RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi,
I->hasNoSignedZeros())) &&
isFPMinMaxRecurrenceKind(Kind)))
return isMinMaxPattern(I, Kind, Prev);
+ else if (isFMulAddIntrinsic(I))
+ return InstDesc(Kind == RecurKind::FMulAdd, I,
+ I->hasAllowReassoc() ? nullptr : I);
return InstDesc(false, I);
}
}
@@ -804,6 +821,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
<< " PHI." << *Phi << "\n");
return true;
}
+ if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC,
+ DT)) {
+ LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n");
+ return true;
+ }
// Not a reduction of known type.
return false;
}
@@ -927,6 +949,7 @@ Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp,
case RecurKind::FMul:
// Multiplying a number by 1 does not change it.
return ConstantFP::get(Tp, 1.0L);
+ case RecurKind::FMulAdd:
case RecurKind::FAdd:
// Adding zero to a number does not change it.
// FIXME: Ideally we should not need to check FMF for FAdd and should always
@@ -974,6 +997,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
return Instruction::Xor;
case RecurKind::FMul:
return Instruction::FMul;
+ case RecurKind::FMulAdd:
case RecurKind::FAdd:
return Instruction::FAdd;
case RecurKind::SMax:
@@ -1032,6 +1056,10 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
return SelectPatternResult::isMinOrMax(
matchSelectPattern(Cur, LHS, RHS).Flavor);
}
+ // Recognize a call to the llvm.fmuladd intrinsic.
+ if (isFMulAddIntrinsic(Cur))
+ return true;
+
return Cur->getOpcode() == RedOp;
};
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 63d6fa5bbb264..ea21ad1f3cc50 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2142,6 +2142,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
case RecurKind::FMax:
case RecurKind::SelectICmp:
case RecurKind::SelectFCmp:
+ case RecurKind::FMulAdd:
return true;
default:
return false;
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 68572d4797425..c8e42acdffb39 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1049,6 +1049,7 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
return Builder.CreateOrReduce(Src);
case RecurKind::Xor:
return Builder.CreateXorReduce(Src);
+ case RecurKind::FMulAdd:
case RecurKind::FAdd:
return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy),
Src);
@@ -1091,7 +1092,8 @@ Value *llvm::createTargetReduction(IRBuilderBase &B,
Value *llvm::createOrderedReduction(IRBuilderBase &B,
const RecurrenceDescriptor &Desc,
Value *Src, Value *Start) {
- assert(Desc.getRecurrenceKind() == RecurKind::FAdd &&
+ assert((Desc.getRecurrenceKind() == RecurKind::FAdd ||
+ Desc.getRecurrenceKind() == RecurKind::FMulAdd) &&
"Unexpected reduction kind");
assert(Src->getType()->isVectorTy() && "Expected a vector type");
assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 79687a000b760..fcb0640f4a1aa 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9775,12 +9775,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
unsigned FirstOpId;
assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
"Only min/max recurrences allowed for inloop reductions");
+ // Recognize a call to the llvm.fmuladd intrinsic.
+ bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
+ assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
+ "Expected instruction to be a call to the llvm.fmuladd intrinsic");
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
"Expected to replace a VPWidenSelectSC");
FirstOpId = 1;
} else {
- assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
+ assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
+ (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
"Expected to replace a VPWidenSC");
FirstOpId = 0;
}
@@ -9791,8 +9796,19 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
auto *CondOp = CM.foldTailByMasking()
? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
: nullptr;
- VPReductionRecipe *RedRecipe = new VPReductionRecipe(
- &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
+
+ if (IsFMulAdd) {
+ // If the instruction is a call to the llvm.fmuladd intrinsic then we
+ // need to create an fmul recipe to use as the vector operand for the
+ // fadd reduction.
+ VPInstruction *FMulRecipe = new VPInstruction(
+ Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
+ WidenRecipe->getParent()->insert(FMulRecipe,
+ WidenRecipe->getIterator());
+ VecOp = FMulRecipe;
+ }
+ VPReductionRecipe *RedRecipe =
+ new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
Plan->removeVPValueFor(R);
Plan->addVPValue(R, RedRecipe);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e3ef0b794f68c..d64eb05700c91 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8791,6 +8791,8 @@ class HorizontalReduction {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ assert(RdxKind != RecurKind::FMulAdd &&
+ "A call to the llvm.fmuladd intrinsic is not handled yet");
++NumVectorInstructions;
return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index bd5c2929cbe11..94dfa876e42a7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -390,6 +390,168 @@ for.end: ; preds = %for.body
ret float %rdx
}
+; Test case where the loop has a call to the llvm.fmuladd intrinsic.
+define float @fmuladd_strict(float* %a, float* %b, i64 %n) #0 {
+; CHECK-ORDERED-LABEL: @fmuladd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret float %muladd
+}
+
+; Same as above but where the call to the llvm.fmuladd intrinsic has a fast-math flag.
+define float @fmuladd_strict_fmf(float* %a, float* %b, i64 %n) #0 {
+; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict_fmf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ %muladd = tail call nnan float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret float %muladd
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
attributes #0 = { vscale_range(0, 16) }
!0 = distinct !{!0, !3, !6, !8}
!1 = distinct !{!1, !3, !7, !8}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
index 60c9e4e0fc5ef..e890468f06ddf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -942,6 +942,7 @@ define float @fadd_multiple_use(i64 %n) {
; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_use
; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
br label %for.body
@@ -967,6 +968,347 @@ for.end:
ret float %phi3
}
+; Test case where the loop has a call to the llvm.fmuladd intrinsic.
+define float @fmuladd_strict(float* %a, float* %b, i64 %n) {
+; CHECK-ORDERED-LABEL: @fmuladd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX]], <8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX1]], <8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX2]], <8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.body:
+; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-ORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX3]], %middle.block ]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret float %muladd
+}
+
+; Test reductions for a VF of 1 and a UF > 1 where the loop has a call to the llvm.fmuladd intrinsic.
+define float @fmuladd_scalar_vf(float* %a, float* %b, i64 %n) {
+; CHECK-ORDERED-LABEL: @fmuladd_scalar_vf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD6:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD7:%.*]] = load float, float*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul float [[LOAD]], [[LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul float [[LOAD1]], [[LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul float [[LOAD2]], [[LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul float [[LOAD3]], [[LOAD7]]
+; CHECK-ORDERED: [[FADD:%.*]] = fadd float [[VEC_PHI]], [[FMUL]]
+; CHECK-ORDERED: [[FADD1:%.*]] = fadd float [[FADD]], [[FMUL1]]
+; CHECK-ORDERED: [[FADD2:%.*]] = fadd float [[FADD1]], [[FMUL2]]
+; CHECK-ORDERED: [[FADD3]] = fadd float [[FADD2]], [[FMUL3]]
+; CHECK-ORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-ORDERED: scalar.ph
+; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[FADD3]], %middle.block ]
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-ORDERED: [[LOAD8:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD9:%.*]] = load float, float*
+; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[FADD3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_scalar_vf
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD6:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD7:%.*]] = load float, float*
+; CHECK-UNORDERED: [[FMULADD]] = call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD4]], float [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call float @llvm.fmuladd.f32(float [[LOAD1]], float [[LOAD5]], float [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call float @llvm.fmuladd.f32(float [[LOAD2]], float [[LOAD6]], float [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call float @llvm.fmuladd.f32(float [[LOAD3]], float [[LOAD7]], float [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd float [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd float [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd float [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: scalar.ph:
+; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX2]], %middle.block ]
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD8:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD9:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[BIN_RDX2]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_scalar_vf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:
+ ret float %muladd
+}
+
+; Test case where the reduction phi is one of the mul operands of the fmuladd.
+define float @fmuladd_phi_is_mul_operand(float* %a, float* %b, i64 %n) {
+; CHECK-ORDERED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-UNORDERED-NOT: vector.body
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ %muladd = tail call float @llvm.fmuladd.f32(float %sum.07, float %0, float %1)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret float %muladd
+}
+
+; Test case where the reduction phi is two operands of the fmuladd.
+define float @fmuladd_phi_is_two_operands(float* %a, i64 %n) {
+; CHECK-ORDERED-LABEL: @fmuladd_phi_is_two_operands
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_two_operands
+; CHECK-UNORDERED-NOT: vector.body
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_two_operands
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %muladd = tail call float @llvm.fmuladd.f32(float %sum.07, float %0, float %sum.07)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret float %muladd
+}
+
+; Test case with multiple calls to llvm.fmuladd, which is not safe to reorder
+; so is only vectorized in the unordered (fast) case.
+define float @fmuladd_multiple(float* %a, float* %b, i64 %n) {
+; CHECK-ORDERED-LABEL: @fmuladd_multiple
+; CHECK-ORDERED-NOT: vector.body:
+
+; CHECK-UNORDERED-LABEL: @fmuladd_multiple
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[FMULADD:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD2]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[FMULADD]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD2:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]])
+; CHECK-UNORDERED: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[MULADD]])
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD2]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_multiple
+; CHECK-NOT-VECTORIZED-NOT: vector.body:
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd2, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+ %muladd2 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %muladd)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret float %muladd2
+}
+
+; Same as above but the first fmuladd is one of the mul operands of the second fmuladd.
+define float @multiple_fmuladds_mul_operand(float* %a, float* %b, i64 %n) {
+; CHECK-ORDERED-LABEL: @multiple_fmuladds_mul_operand
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @multiple_fmuladds_mul_operand
+; CHECK-UNORDERED-NOT: vector.body
+
+; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_mul_operand
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd2, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+ %muladd2 = tail call float @llvm.fmuladd.f32(float %0, float %muladd, float %1)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret float %muladd2
+}
+
+; Same as above but the first fmuladd is two of the operands of the second fmuladd.
+define float @multiple_fmuladds_two_operands(float* %a, float* %b, i64 %n) {
+; CHECK-ORDERED-LABEL: @multiple_fmuladds_two_operands
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @multiple_fmuladds_two_operands
+; CHECK-UNORDERED-NOT: vector.body
+
+; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_two_operands
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd2, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+ %muladd2 = tail call float @llvm.fmuladd.f32(float %0, float %muladd, float %muladd)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret float %muladd2
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
!0 = distinct !{!0, !5, !9, !11}
!1 = distinct !{!1, !5, !10, !11}
!2 = distinct !{!2, !6, !9, !11}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index 01b26818fa8c0..0dfc8a8886b4b 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -1091,6 +1091,73 @@ entry:
ret i8 %ret
}
+; Test case where the loop has a call to the llvm.fmuladd intrinsic.
+define float @reduction_fmuladd(float* %a, float* %b, i64 %n) {
+; CHECK-LABEL: @reduction_fmuladd(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6]] = fadd float [[TMP5]], [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
+; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP8]], float [[TMP9]], float [[SUM_07]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
+; CHECK: for.end:
+; CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret float [[MULADD_LCSSA]]
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+ %1 = load float, float* %arrayidx2, align 4
+ %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+ ret float %muladd
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!8 = !{!"llvm.loop.vectorize.enable", i1 true}