[llvm] 03505a0 - [RISCV] Enable scalable loop vectorization for fmax/fmin reductions with f16/bf16 type for zvfhmin/zvfbfmin (#129629)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 4 00:49:27 PST 2025
Author: Jim Lin
Date: 2025-03-04T16:49:24+08:00
New Revision: 03505a004ff6909c46d6b8c498a9ffccd47d88a0
URL: https://github.com/llvm/llvm-project/commit/03505a004ff6909c46d6b8c498a9ffccd47d88a0
DIFF: https://github.com/llvm/llvm-project/commit/03505a004ff6909c46d6b8c498a9ffccd47d88a0.diff
LOG: [RISCV] Enable scalable loop vectorization for fmax/fmin reductions with f16/bf16 type for zvfhmin/zvfbfmin (#129629)
This PR enable scalable loop vectorization for fmax and fmin reductions
with f16/bf16 type when only zvfhmin/zvfbfmin are enabled.
After https://github.com/llvm/llvm-project/pull/128800, we can promote
the fmax/fmin reductions with f16/bf16 type to f32 reductions for
zvfhmin/zvfbfmin.
Added:
Modified:
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 3f57560d3c127..020a2b8d4edfb 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -349,15 +349,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, Ty)))
return false;
- // We can't promote f16/bf16 fadd reductions and scalable vectors can't be
- // expanded.
- // TODO: Promote f16/bf16 fmin/fmax reductions
- if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16()))
- return false;
-
switch (RdxDesc.getRecurrenceKind()) {
case RecurKind::Add:
- case RecurKind::FAdd:
case RecurKind::And:
case RecurKind::Or:
case RecurKind::Xor:
@@ -365,11 +358,17 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
case RecurKind::SMax:
case RecurKind::UMin:
case RecurKind::UMax:
+ case RecurKind::IAnyOf:
case RecurKind::FMin:
case RecurKind::FMax:
- case RecurKind::FMulAdd:
- case RecurKind::IAnyOf:
+ return true;
case RecurKind::FAnyOf:
+ case RecurKind::FAdd:
+ case RecurKind::FMulAdd:
+ // We can't promote f16/bf16 fadd reductions and scalable vectors can't be
+ // expanded.
+ if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16()))
+ return false;
return true;
default:
return false;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
index 01a2a757dea5d..c3c2b8ee012a0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
@@ -344,6 +344,70 @@ for.end:
ret float %.sroa.speculated
}
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define half @fmin_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #1 {
+; CHECK-LABEL: @fmin_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
+; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x half> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x half> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x half> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x half> %[[SEL1]], <vscale x 8 x half> %[[SEL2]]
+; CHECK-NEXT: call half @llvm.vector.reduce.fmin.nxv8f16(<vscale x 8 x half> %[[SEL]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi half [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+ %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
+ %0 = load half, ptr %arrayidx, align 4
+ %cmp.i = fcmp olt half %0, %sum.07
+ %.sroa.speculated = select i1 %cmp.i, half %0, half %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret half %.sroa.speculated
+}
+
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define bfloat @fmin_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64 %n) #2 {
+; CHECK-LABEL: @fmin_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x bfloat> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x bfloat> %[[SEL1]], <vscale x 8 x bfloat> %[[SEL2]]
+; CHECK-NEXT: call bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat> %[[SEL]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+ %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
+ %0 = load bfloat, ptr %arrayidx, align 4
+ %cmp.i = fcmp olt bfloat %0, %sum.07
+ %.sroa.speculated = select i1 %cmp.i, bfloat %0, bfloat %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret bfloat %.sroa.speculated
+}
+
; FMAX (FAST)
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
@@ -378,6 +442,70 @@ for.end:
ret float %.sroa.speculated
}
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define half @fmax_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #1 {
+; CHECK-LABEL: @fmax_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
+; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x half> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x half> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x half> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x half> %[[SEL1]], <vscale x 8 x half> %[[SEL2]]
+; CHECK-NEXT: call fast half @llvm.vector.reduce.fmax.nxv8f16(<vscale x 8 x half> %[[SEL]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi half [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+ %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
+ %0 = load half, ptr %arrayidx, align 4
+ %cmp.i = fcmp fast ogt half %0, %sum.07
+ %.sroa.speculated = select i1 %cmp.i, half %0, half %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret half %.sroa.speculated
+}
+
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define bfloat @fmax_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64 %n) #2 {
+; CHECK-LABEL: @fmax_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x bfloat>
+; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x bfloat> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x bfloat> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x bfloat> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x bfloat> %[[SEL1]], <vscale x 8 x bfloat> %[[SEL2]]
+; CHECK-NEXT: call fast bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat> %[[SEL]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
+ %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
+ %0 = load bfloat, ptr %arrayidx, align 4
+ %cmp.i = fcmp fast ogt bfloat %0, %sum.07
+ %.sroa.speculated = select i1 %cmp.i, bfloat %0, bfloat %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret bfloat %.sroa.speculated
+}
+
; Reduction cannot be vectorized
; MUL
@@ -591,6 +719,8 @@ for.end:
declare float @llvm.fmuladd.f32(float, float, float)
attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfhmin,+zvfhmin"}
+attributes #2 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfbfmin,+zvfbfmin"}
!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.vectorize.width", i32 8}
More information about the llvm-commits
mailing list