[clang] [llvm] Vectorize: Support fminimumnum and fmaximumnum (PR #131781)

via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 18 03:55:06 PDT 2025


llvmbot wrote:


@llvm/pr-subscribers-clang

Author: YunQiang Su (wzssyqa)

<details>
<summary>Changes</summary>

Support auto-vectorization of fminimum_num and fmaximum_num.
On AArch64 with SVE, scalable vectors are not supported yet;
on RISC-V Vector, scalable vectors work well now.
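
For reference, the scalar semantics the vectorizer has to preserve here are IEEE 754-2019 minimumNumber/maximumNumber: a quiet NaN loses against a number, and -0.0 orders below +0.0. A minimal standalone check of those properties (assuming a clang new enough to provide the `__builtin_fminimum_num*` builtins, as the test below does):

```cpp
// Standalone check of minimumNumber/maximumNumber semantics; build with a
// recent clang (availability of the builtins below is an assumption).
#include <cassert>
#include <cmath>

int main() {
  float qnan = __builtin_nanf("");
  // The NaN operand is suppressed: the numeric operand wins.
  assert(__builtin_fminimum_numf(qnan, 1.0f) == 1.0f);
  assert(__builtin_fmaximum_numf(1.0f, qnan) == 1.0f);
  // Signed zeros are ordered: minimumNumber(-0.0, +0.0) is -0.0.
  assert(std::signbit(__builtin_fminimum_numf(-0.0f, 0.0f)));
  return 0;
}
```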

---

Patch is 33.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131781.diff


4 Files Affected:

- (added) clang/test/CodeGen/fminimum-num-autovec.c (+407) 
- (modified) llvm/include/llvm/CodeGen/BasicTTIImpl.h (+6) 
- (modified) llvm/lib/Analysis/VectorUtils.cpp (+2) 
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+11-2) 
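
The BasicTTIImpl.h and VectorUtils.cpp hunks fall past the truncation point below, so the following is only an assumption based on their line counts and on how the neighboring min/max intrinsics are handled: a pair of switch cases in BasicTTIImpl.h mapping the intrinsics to their ISD opcodes for costing, plus two case labels in `isTriviallyVectorizable()` so the loop vectorizer may widen the calls at all. A sketch:

```cpp
// Sketch (assumed, not from the visible diff):
// llvm/include/llvm/CodeGen/BasicTTIImpl.h, in the switch that maps
// intrinsic IDs to ISD opcodes when costing intrinsic calls:
case Intrinsic::minimumnum:
  ISD = ISD::FMINIMUMNUM;
  break;
case Intrinsic::maximumnum:
  ISD = ISD::FMAXIMUMNUM;
  break;

// llvm/lib/Analysis/VectorUtils.cpp, in isTriviallyVectorizable(), which
// gates whether the loop vectorizer may widen a call to these intrinsics:
case Intrinsic::minimumnum:
case Intrinsic::maximumnum:
```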


``````````diff
diff --git a/clang/test/CodeGen/fminimum-num-autovec.c b/clang/test/CodeGen/fminimum-num-autovec.c
new file mode 100644
index 0000000000000..94114b6227d27
--- /dev/null
+++ b/clang/test/CodeGen/fminimum-num-autovec.c
@@ -0,0 +1,407 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang --target=aarch64-unknown-linux-gnu -march=armv8+fp16 %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=ARMV8
+// RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64gv_zvfh %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=RV64_ZVFH
+// FIXME: SVE cannot emit VSCALE.
+
+
+float af32[4096];
+float bf32[4096];
+float cf32[4096];
+// ARMV8-LABEL: define dso_local void @f32min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA6:![0-9]+]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f32min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA9:![0-9]+]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f32min() {
+	for (int i=0; i<4096; i++) {cf32[i] = __builtin_fminimum_numf(af32[i], bf32[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f32max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f32max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.maximumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f32max() {
+	for (int i=0; i<4096; i++) {cf32[i] = __builtin_fmaximum_numf(af32[i], bf32[i]);}
+}
+
+double af64[4096];
+double bf64[4096];
+double cf64[4096];
+// ARMV8-LABEL: define dso_local void @f64min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA15:![0-9]+]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT:    store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f64min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA18:![0-9]+]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.minimumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f64min() {
+	for (int i=0; i<4096; i++) {cf64[i] = __builtin_fminimum_num(af64[i], bf64[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f64max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT:    store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f64max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.maximumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT:    br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+// RV64_ZVFH:       [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT:    ret void
+//
+void f64max() {
+	for (int i=0; i<4096; i++) {cf64[i] = __builtin_fmaximum_num(af64[i], bf64[i]);}
+}
+
+__fp16 af16[4096];
+__fp16 bf16[4096];
+__fp16 cf16[4096];
+// ARMV8-LABEL: define dso_local void @f16min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT:  [[ENTRY:.*]]:
+// ARMV8-NEXT:    br label %[[VECTOR_BODY:.*]]
+// ARMV8:       [[VECTOR_BODY]]:
+// ARMV8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2, !tbaa [[TBAA19:![0-9]+]]
+// ARMV8-NEXT:    [[WIDE_LOAD11:%.*]] = load <8 x half>, ptr [[TMP1]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x half>, ptr [[TMP2]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x half>, ptr [[TMP3]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[TMP4:%.*]] = tail call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD12]])
+// ARMV8-NEXT:    [[TMP5:%.*]] = tail call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD11]], <8 x half> [[WIDE_LOAD13]])
+// ARMV8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT:    store <8 x half> [[TMP4]], ptr [[TMP6]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    store <8 x half> [[TMP5]], ptr [[TMP7]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+// ARMV8-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT:    br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+// ARMV8:       [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT:    ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f16min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT:  [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp samesign ugt i64 [[TMP0]], 512
+// RV64_ZVFH-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
+// RV64_ZVFH:       [[VECTOR_PH]]:
+// RV64_ZVFH-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[DOTNEG:%.*]] = mul nuw nsw i64 [[TMP1]], 8184
+// RV64_ZVFH-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], 4096
+// RV64_ZVFH-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
+// RV64_ZVFH-NEXT:    br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH:       [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2, !tbaa [[TBAA22:![0-9]+]]
+// RV64_ZVFH-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    [[WIDE_LOAD10:%.*]] = load <vscale x 8 x half>, ptr [[TMP5]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.minimumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT:    store <vscale x 8 x half> [[TMP6]], ptr [[TMP7]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+// RV64_ZVFH-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+// RV64_ZVFH-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+// RV64_ZVFH:       [[MIDDLE_BLOCK]]:
+// RV64_ZVFH-NEXT:    [[CMP_N_NOT:%.*]] = icmp eq i64 [[N_VEC]], 0
+// RV64_ZVFH-NEXT:    br i1 [[CMP_N_NOT]], label %[[FOR_BODY_PREHEADER]], label %[[F...
[truncated]

``````````
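
The RISCVISelLowering.cpp hunk (+11-2) is also truncated away. RVV's vfmin.vv/vfmax.vv already implement IEEE 754-2019 minimumNumber/maximumNumber semantics, so the change presumably just marks the new nodes legal for the supported vector types, roughly:

```cpp
// Sketch (assumed): inside the per-vector-type setup in RISCVTargetLowering,
// where VT is each supported FP vector type, mark the nodes legal so that
// @llvm.minimumnum/@llvm.maximumnum select directly to vfmin.vv/vfmax.vv.
setOperationAction({ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}, VT, Legal);
```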

</details>


https://github.com/llvm/llvm-project/pull/131781

