[llvm] [IVDescriptors] Support reductions with minimumnum/maximumnum. (PR #137335)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 25 07:13:06 PDT 2025
https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/137335
Add a new reduction recurrence kind for reductions with minimumnum/maximumnum. Such reductions can be vectorized without nsz/nnans, same as reductions with maximum/minimum intrinsics.
Note that a new reduction kind is needed to make sure partial reductions are also combined with minimumnum/maximumnum.
Note that the final reduction to a scalar value is performed with vector.reduce.fmin/fmax. This should be fine, as the partial reductions with maximumnum/minimumnum silence any sNaNs.
In-loop reductions and reductions in SLP are not supported yet, as there's no reduction version of maximumnum/minimumnum yet and fmax may be incorrect.
>From 8b08c271863561feaf838993815f29d5dcc18f67 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 23 Apr 2025 15:45:36 +0100
Subject: [PATCH] [IVDescriptors] Support reductions with
minimumnum/maximumnum.
Add a new reduction recurrence kind for reductions with
minimumnum/maximumnum. Such reductions can be vectorized without
nsz/nnans, same as reductions with maximum/minimum intrinsics.
Note that a new reduction kind is needed to make sure partial reductions
are also combined with minimumnum/maximumnum.
Note that the final reduction to a scalar value is performed with
vector.reduce.fmin/fmax. This should be fine, as the partial reductions
with maximumnum/minimumnum silence any sNaNs.
In-loop reductions and reductions in SLP are not supported yet, as
there's no reduction version of maximumnum/minimumnum yet and fmax may
be incorrect.
---
llvm/include/llvm/Analysis/IVDescriptors.h | 5 +-
llvm/lib/Analysis/IVDescriptors.cpp | 29 +++-
llvm/lib/Transforms/Utils/LoopUtils.cpp | 13 +-
.../Transforms/Vectorize/SLPVectorizer.cpp | 10 +-
.../minimumnum-maximumnum-reductions.ll | 164 +++++++++++++++---
.../AArch64/reduce-maximumnum-minimumnum.ll | 62 +++++++
6 files changed, 249 insertions(+), 34 deletions(-)
create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/reduce-maximumnum-minimumnum.ll
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 42bccdc028461..140edff13a67f 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -47,6 +47,8 @@ enum class RecurKind {
FMax, ///< FP max implemented in terms of select(cmp()).
FMinimum, ///< FP min with llvm.minimum semantics
FMaximum, ///< FP max with llvm.maximum semantics
+ FMinimumNum, ///< FP min with llvm.minimumnum semantics
+ FMaximumNum, ///< FP max with llvm.maximumnum semantics
FMulAdd, ///< Sum of float products with llvm.fmuladd(a * b + sum).
IAnyOf, ///< Any_of reduction with select(icmp(),x,y) where one of (x,y) is
///< loop invariant, and both x and y are integer type.
@@ -239,7 +241,8 @@ class RecurrenceDescriptor {
/// Returns true if the recurrence kind is a floating-point min/max kind.
static bool isFPMinMaxRecurrenceKind(RecurKind Kind) {
return Kind == RecurKind::FMin || Kind == RecurKind::FMax ||
- Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum;
+ Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum ||
+ Kind == RecurKind::FMinimumNum || Kind == RecurKind::FMaximumNum;
}
/// Returns true if the recurrence kind is any min/max kind.
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 94c347b01bbfb..e7e0bef048f71 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -788,6 +788,10 @@ RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
return InstDesc(Kind == RecurKind::FMin, I);
if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
return InstDesc(Kind == RecurKind::FMax, I);
+ if (match(I, m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())))
+ return InstDesc(Kind == RecurKind::FMinimumNum, I);
+ if (match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value())))
+ return InstDesc(Kind == RecurKind::FMaximumNum, I);
if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
return InstDesc(Kind == RecurKind::FMinimum, I);
if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
@@ -892,10 +896,14 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
return true;
if (isa<FPMathOperator>(I) && I->hasNoNaNs() && I->hasNoSignedZeros())
return true;
- // minimum and maximum intrinsics do not require nsz and nnan flags since
- // NaN and signed zeroes are propagated in the intrinsic implementation.
+ // minimum/maximum and minimumnum/maximumnum intrinsics do not require nsz
+ // and nnan flags since the handling of NaNs and signed zeroes is fully
+ // defined by the intrinsic semantics.
return match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())) ||
- match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value()));
+ match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())) ||
+ match(I,
+ m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())) ||
+ match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value()));
};
if (isIntMinMaxRecurrenceKind(Kind) ||
(HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind)))
@@ -1035,6 +1043,19 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
LLVM_DEBUG(dbgs() << "Found a float MINIMUM reduction PHI." << *Phi << "\n");
return true;
}
+ if (AddReductionVar(Phi, RecurKind::FMaximumNum, TheLoop, FMF, RedDes, DB, AC,
+ DT, SE)) {
+ LLVM_DEBUG(dbgs() << "Found a float MAXIMUMNUM reduction PHI." << *Phi
+ << "\n");
+ return true;
+ }
+ if (AddReductionVar(Phi, RecurKind::FMinimumNum, TheLoop, FMF, RedDes, DB, AC,
+ DT, SE)) {
+ LLVM_DEBUG(dbgs() << "Found a float MINIMUMNUM reduction PHI." << *Phi
+ << "\n");
+ return true;
+ }
+
// Not a reduction of known type.
return false;
}
@@ -1155,6 +1176,8 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
case RecurKind::FMin:
case RecurKind::FMaximum:
case RecurKind::FMinimum:
+ case RecurKind::FMaximumNum:
+ case RecurKind::FMinimumNum:
case RecurKind::FAnyOf:
case RecurKind::FFindLastIV:
return Instruction::FCmp;
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index f57d95e7722dc..2fff9521017ff 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -958,6 +958,10 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
return Intrinsic::vector_reduce_fmaximum;
case RecurKind::FMinimum:
return Intrinsic::vector_reduce_fminimum;
+ case RecurKind::FMaximumNum:
+ return Intrinsic::vector_reduce_fmax;
+ case RecurKind::FMinimumNum:
+ return Intrinsic::vector_reduce_fmin;
}
}
@@ -1053,6 +1057,10 @@ Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) {
return Intrinsic::minimum;
case RecurKind::FMaximum:
return Intrinsic::maximum;
+ case RecurKind::FMinimumNum:
+ return Intrinsic::minimumnum;
+ case RecurKind::FMaximumNum:
+ return Intrinsic::maximumnum;
}
}
@@ -1101,7 +1109,8 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
Value *Right) {
Type *Ty = Left->getType();
if (Ty->isIntOrIntVectorTy() ||
- (RK == RecurKind::FMinimum || RK == RecurKind::FMaximum)) {
+ (RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
+ RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum)) {
// TODO: Add float minnum/maxnum support when FMF nnan is set.
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK);
return Builder.CreateIntrinsic(Ty, Id, {Left, Right}, nullptr,
@@ -1320,6 +1329,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
case RecurKind::FMin:
case RecurKind::FMinimum:
case RecurKind::FMaximum:
+ case RecurKind::FMinimumNum:
+ case RecurKind::FMaximumNum:
return Builder.CreateUnaryIntrinsic(getReductionIntrinsicID(RdxKind), Src);
case RecurKind::FMulAdd:
case RecurKind::FAdd:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 53da78ee599b7..56a3bf74814b5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21774,7 +21774,9 @@ class HorizontalReduction {
case RecurKind::FMax:
case RecurKind::FMin:
case RecurKind::FMaximum:
- case RecurKind::FMinimum: {
+ case RecurKind::FMinimum:
+ case RecurKind::FMaximumNum:
+ case RecurKind::FMinimumNum: {
Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
}
@@ -23086,6 +23088,8 @@ class HorizontalReduction {
case RecurKind::FAnyOf:
case RecurKind::IFindLastIV:
case RecurKind::FFindLastIV:
+ case RecurKind::FMaximumNum:
+ case RecurKind::FMinimumNum:
case RecurKind::None:
llvm_unreachable("Unexpected reduction kind for repeated scalar.");
}
@@ -23220,6 +23224,8 @@ class HorizontalReduction {
case RecurKind::FAnyOf:
case RecurKind::IFindLastIV:
case RecurKind::FFindLastIV:
+ case RecurKind::FMaximumNum:
+ case RecurKind::FMinimumNum:
case RecurKind::None:
llvm_unreachable("Unexpected reduction kind for repeated scalar.");
}
@@ -23319,6 +23325,8 @@ class HorizontalReduction {
case RecurKind::FAnyOf:
case RecurKind::IFindLastIV:
case RecurKind::FFindLastIV:
+ case RecurKind::FMaximumNum:
+ case RecurKind::FMinimumNum:
case RecurKind::None:
llvm_unreachable("Unexpected reduction kind for reused scalars.");
}
diff --git a/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll b/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll
index eb6dcc72df57e..6dde2b9adc7c8 100644
--- a/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll
@@ -6,18 +6,42 @@ define float @maximumnum_intrinsic(ptr readonly %x) {
; CHECK-LABEL: define float @maximumnum_intrinsic(
; CHECK-SAME: ptr readonly [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP4]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[IV]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> [[RDX_MINMAX]])
+; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
-; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4
+; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]]
+; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4
; CHECK-NEXT: [[RED_NEXT]] = tail call float @llvm.maximumnum.f32(float [[RED]], float [[L]])
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]]
;
entry:
@@ -41,18 +65,42 @@ define float @maximumnum_intrinsic_fast(ptr readonly %x) {
; CHECK-LABEL: define float @maximumnum_intrinsic_fast(
; CHECK-SAME: ptr readonly [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3]] = call fast <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP4]] = call fast <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[IV]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call fast <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> [[RDX_MINMAX]])
+; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
-; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4
+; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]]
+; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4
; CHECK-NEXT: [[RED_NEXT]] = tail call fast float @llvm.maximumnum.f32(float [[RED]], float [[L]])
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]]
;
entry:
@@ -76,18 +124,42 @@ define float @minimumnum_intrinsic(ptr readonly %x) {
; CHECK-LABEL: define float @minimumnum_intrinsic(
; CHECK-SAME: ptr readonly [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP4]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[IV]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> [[RDX_MINMAX]])
+; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
-; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4
+; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]]
+; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4
; CHECK-NEXT: [[RED_NEXT]] = tail call float @llvm.minimumnum.f32(float [[RED]], float [[L]])
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]]
;
entry:
@@ -111,18 +183,42 @@ define float @minimumnum_intrinsic_fast(ptr readonly %x) {
; CHECK-LABEL: define float @minimumnum_intrinsic_fast(
; CHECK-SAME: ptr readonly [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3]] = call fast <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP4]] = call fast <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[IV]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call fast <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> [[RDX_MINMAX]])
+; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
-; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4
+; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]]
+; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4
; CHECK-NEXT: [[RED_NEXT]] = tail call fast float @llvm.minimumnum.f32(float [[RED]], float [[L]])
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]]
;
entry:
@@ -144,3 +240,15 @@ exit:
declare float @llvm.minimumnum.f32(float, float)
declare float @llvm.maximumnum.f32(float, float)
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-maximumnum-minimumnum.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-maximumnum-minimumnum.ll
new file mode 100644
index 0000000000000..bb7695794f0b2
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-maximumnum-minimumnum.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-macosx -S %s | FileCheck %s
+
+declare float @llvm.maximumnum.f32(float, float)
+declare float @llvm.minimumnum.f32(float, float)
+
+; TODO: Need reduction version of maximumnum/minimumnum.
+define float @reduction_v4f32_maximumnum(ptr %p) {
+; CHECK-LABEL: define float @reduction_v4f32_maximumnum
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[P]], i64 1
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
+; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[P]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT: [[M1:%.*]] = tail call float @llvm.maximumnum.f32(float [[T1]], float [[T0]])
+; CHECK-NEXT: [[M2:%.*]] = tail call float @llvm.maximumnum.f32(float [[T2]], float [[M1]])
+; CHECK-NEXT: [[M3:%.*]] = tail call float @llvm.maximumnum.f32(float [[T3]], float [[M2]])
+; CHECK-NEXT: ret float [[M3]]
+;
+ %g1 = getelementptr inbounds float, ptr %p, i64 1
+ %g2 = getelementptr inbounds float, ptr %p, i64 2
+ %g3 = getelementptr inbounds float, ptr %p, i64 3
+ %t0 = load float, ptr %p, align 4
+ %t1 = load float, ptr %g1, align 4
+ %t2 = load float, ptr %g2, align 4
+ %t3 = load float, ptr %g3, align 4
+ %m1 = tail call float @llvm.maximumnum.f32(float %t1, float %t0)
+ %m2 = tail call float @llvm.maximumnum.f32(float %t2, float %m1)
+ %m3 = tail call float @llvm.maximumnum.f32(float %t3, float %m2)
+ ret float %m3
+}
+
+define float @reduction_v4f64_minimumnum(ptr %p) {
+; CHECK-LABEL: define float @reduction_v4f64_minimumnum
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[P]], i64 1
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
+; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[P]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT: [[M1:%.*]] = tail call float @llvm.minimumnum.f32(float [[T1]], float [[T0]])
+; CHECK-NEXT: [[M2:%.*]] = tail call float @llvm.minimumnum.f32(float [[T2]], float [[M1]])
+; CHECK-NEXT: [[M3:%.*]] = tail call float @llvm.minimumnum.f32(float [[T3]], float [[M2]])
+; CHECK-NEXT: ret float [[M3]]
+;
+ %g1 = getelementptr inbounds float, ptr %p, i64 1
+ %g2 = getelementptr inbounds float, ptr %p, i64 2
+ %g3 = getelementptr inbounds float, ptr %p, i64 3
+ %t0 = load float, ptr %p, align 4
+ %t1 = load float, ptr %g1, align 4
+ %t2 = load float, ptr %g2, align 4
+ %t3 = load float, ptr %g3, align 4
+ %m1 = tail call float @llvm.minimumnum.f32(float %t1, float %t0)
+ %m2 = tail call float @llvm.minimumnum.f32(float %t2, float %m1)
+ %m3 = tail call float @llvm.minimumnum.f32(float %t3, float %m2)
+ ret float %m3
+}
More information about the llvm-commits
mailing list