[clang] [llvm] Vectorize: Support fminimumnum and fmaximumnum (PR #131781)
YunQiang Su via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 31 01:22:47 PDT 2025
https://github.com/wzssyqa updated https://github.com/llvm/llvm-project/pull/131781
>From fd57d662effa70dc6610d318ed770be127a2fa9d Mon Sep 17 00:00:00 2001
From: YunQiang Su <yunqiang at isrc.iscas.ac.cn>
Date: Tue, 18 Mar 2025 18:46:29 +0800
Subject: [PATCH 1/2] Vectorize: Support fminimumnum and fmaximumnum
Support auto-vectorization of fminimum_num and fmaximum_num.
For ARM64 with SVE, scalable vectors are not supported yet, while
for RISC-V Vector, scalable vectors work well now.
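
For illustration, the pattern this change targets is a plain scalar loop over
the C23 fminimum_num/fmaximum_num builtins, as in the sketch below. The array
names and trip count are arbitrary; the full autogenerated checks are in the
test added by this patch.

/* Illustrative scalar loop: with this patch the loop vectorizer should
   widen the __builtin_fminimum_numf calls into @llvm.minimumnum.* vector
   intrinsics (fixed-width <4 x float> on AArch64 NEON, scalable
   <vscale x 4 x float> on RISC-V V). */
#define N 4096
float a[N], b[N], c[N];

void vec_fmin(void) {
  for (int i = 0; i < N; i++)
    c[i] = __builtin_fminimum_numf(a[i], b[i]);
}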
---
clang/test/CodeGen/fminimum-num-autovec.c | 407 ++++++++++++++++++++
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 6 +
llvm/lib/Analysis/VectorUtils.cpp | 2 +
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 13 +-
4 files changed, 426 insertions(+), 2 deletions(-)
create mode 100644 clang/test/CodeGen/fminimum-num-autovec.c
diff --git a/clang/test/CodeGen/fminimum-num-autovec.c b/clang/test/CodeGen/fminimum-num-autovec.c
new file mode 100644
index 0000000000000..94114b6227d27
--- /dev/null
+++ b/clang/test/CodeGen/fminimum-num-autovec.c
@@ -0,0 +1,407 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang --target=aarch64-unknown-linux-gnu -march=armv8+fp16 %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=ARMV8
+// RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64gv_zvfh %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=RV64_ZVFH
+// FIXME: SVE cannot emit VSCALE.
+
+
+float af32[4096];
+float bf32[4096];
+float cf32[4096];
+// ARMV8-LABEL: define dso_local void @f32min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// ARMV8-NEXT: [[ENTRY:.*]]:
+// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
+// ARMV8: [[VECTOR_BODY]]:
+// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA6:![0-9]+]]
+// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[TMP4:%.*]] = tail call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+// ARMV8-NEXT: [[TMP5:%.*]] = tail call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
+// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+// ARMV8: [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT: ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f32min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH: [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA9:![0-9]+]]
+// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT: ret void
+//
+void f32min() {
+ for (int i=0; i<4096; i++) {cf32[i] = __builtin_fminimum_numf(af32[i], bf32[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f32max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT: [[ENTRY:.*]]:
+// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
+// ARMV8: [[VECTOR_BODY]]:
+// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[TMP4:%.*]] = tail call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+// ARMV8-NEXT: [[TMP5:%.*]] = tail call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
+// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa [[TBAA6]]
+// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+// ARMV8: [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT: ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f32max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH: [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.maximumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA9]]
+// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT: ret void
+//
+void f32max() {
+ for (int i=0; i<4096; i++) {cf32[i] = __builtin_fmaximum_numf(af32[i], bf32[i]);}
+}
+
+double af64[4096];
+double bf64[4096];
+double cf64[4096];
+// ARMV8-LABEL: define dso_local void @f64min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT: [[ENTRY:.*]]:
+// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
+// ARMV8: [[VECTOR_BODY]]:
+// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA15:![0-9]+]]
+// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[TMP4:%.*]] = tail call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
+// ARMV8-NEXT: [[TMP5:%.*]] = tail call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> [[WIDE_LOAD13]])
+// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+// ARMV8: [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT: ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f64min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH: [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA18:![0-9]+]]
+// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.minimumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT: ret void
+//
+void f64min() {
+ for (int i=0; i<4096; i++) {cf64[i] = __builtin_fminimum_num(af64[i], bf64[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f64max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT: [[ENTRY:.*]]:
+// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
+// ARMV8: [[VECTOR_BODY]]:
+// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[TMP4:%.*]] = tail call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
+// ARMV8-NEXT: [[TMP5:%.*]] = tail call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> [[WIDE_LOAD13]])
+// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa [[TBAA15]]
+// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+// ARMV8: [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT: ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f64max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH: [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.maximumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], align 8, !tbaa [[TBAA18]]
+// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT: ret void
+//
+void f64max() {
+ for (int i=0; i<4096; i++) {cf64[i] = __builtin_fmaximum_num(af64[i], bf64[i]);}
+}
+
+__fp16 af16[4096];
+__fp16 bf16[4096];
+__fp16 cf16[4096];
+// ARMV8-LABEL: define dso_local void @f16min(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT: [[ENTRY:.*]]:
+// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
+// ARMV8: [[VECTOR_BODY]]:
+// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2, !tbaa [[TBAA19:![0-9]+]]
+// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x half>, ptr [[TMP1]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x half>, ptr [[TMP2]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x half>, ptr [[TMP3]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[TMP4:%.*]] = tail call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD12]])
+// ARMV8-NEXT: [[TMP5:%.*]] = tail call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD11]], <8 x half> [[WIDE_LOAD13]])
+// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT: store <8 x half> [[TMP4]], ptr [[TMP6]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: store <8 x half> [[TMP5]], ptr [[TMP7]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+// ARMV8: [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT: ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f16min(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ugt i64 [[TMP0]], 512
+// RV64_ZVFH-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
+// RV64_ZVFH: [[VECTOR_PH]]:
+// RV64_ZVFH-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[DOTNEG:%.*]] = mul nuw nsw i64 [[TMP1]], 8184
+// RV64_ZVFH-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], 4096
+// RV64_ZVFH-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
+// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH: [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2, !tbaa [[TBAA22:![0-9]+]]
+// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x half>, ptr [[TMP5]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.minimumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: store <vscale x 8 x half> [[TMP6]], ptr [[TMP7]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+// RV64_ZVFH-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+// RV64_ZVFH-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+// RV64_ZVFH: [[MIDDLE_BLOCK]]:
+// RV64_ZVFH-NEXT: [[CMP_N_NOT:%.*]] = icmp eq i64 [[N_VEC]], 0
+// RV64_ZVFH-NEXT: br i1 [[CMP_N_NOT]], label %[[FOR_BODY_PREHEADER]], label %[[FOR_COND_CLEANUP:.*]]
+// RV64_ZVFH: [[FOR_BODY_PREHEADER]]:
+// RV64_ZVFH-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
+// RV64_ZVFH-NEXT: br label %[[FOR_BODY:.*]]
+// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT: ret void
+// RV64_ZVFH: [[FOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER]] ]
+// RV64_ZVFH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT: [[TMP10:%.*]] = load half, ptr [[ARRAYIDX2]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
+// RV64_ZVFH-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT: store half [[TMP11]], ptr [[ARRAYIDX4]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+// RV64_ZVFH-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+// RV64_ZVFH-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+//
+void f16min() {
+ for (int i=0; i<4096; i++) {cf16[i] = __builtin_fminimum_numf16(af16[i], bf16[i]);}
+}
+// ARMV8-LABEL: define dso_local void @f16max(
+// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// ARMV8-NEXT: [[ENTRY:.*]]:
+// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
+// ARMV8: [[VECTOR_BODY]]:
+// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x half>, ptr [[TMP1]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
+// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x half>, ptr [[TMP2]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x half>, ptr [[TMP3]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[TMP4:%.*]] = tail call <8 x half> @llvm.maximumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD12]])
+// ARMV8-NEXT: [[TMP5:%.*]] = tail call <8 x half> @llvm.maximumnum.v8f16(<8 x half> [[WIDE_LOAD11]], <8 x half> [[WIDE_LOAD13]])
+// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
+// ARMV8-NEXT: store <8 x half> [[TMP4]], ptr [[TMP6]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: store <8 x half> [[TMP5]], ptr [[TMP7]], align 2, !tbaa [[TBAA19]]
+// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+// ARMV8: [[FOR_COND_CLEANUP]]:
+// ARMV8-NEXT: ret void
+//
+// RV64_ZVFH-LABEL: define dso_local void @f16max(
+// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
+// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ugt i64 [[TMP0]], 512
+// RV64_ZVFH-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
+// RV64_ZVFH: [[VECTOR_PH]]:
+// RV64_ZVFH-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[DOTNEG:%.*]] = mul nuw nsw i64 [[TMP1]], 8184
+// RV64_ZVFH-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], 4096
+// RV64_ZVFH-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// RV64_ZVFH-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
+// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
+// RV64_ZVFH: [[VECTOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+// RV64_ZVFH-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x half>, ptr [[TMP5]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.maximumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD10]])
+// RV64_ZVFH-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+// RV64_ZVFH-NEXT: store <vscale x 8 x half> [[TMP6]], ptr [[TMP7]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+// RV64_ZVFH-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+// RV64_ZVFH-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+// RV64_ZVFH: [[MIDDLE_BLOCK]]:
+// RV64_ZVFH-NEXT: [[CMP_N_NOT:%.*]] = icmp eq i64 [[N_VEC]], 0
+// RV64_ZVFH-NEXT: br i1 [[CMP_N_NOT]], label %[[FOR_BODY_PREHEADER]], label %[[FOR_COND_CLEANUP:.*]]
+// RV64_ZVFH: [[FOR_BODY_PREHEADER]]:
+// RV64_ZVFH-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
+// RV64_ZVFH-NEXT: br label %[[FOR_BODY:.*]]
+// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
+// RV64_ZVFH-NEXT: ret void
+// RV64_ZVFH: [[FOR_BODY]]:
+// RV64_ZVFH-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER]] ]
+// RV64_ZVFH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT: [[TMP10:%.*]] = load half, ptr [[ARRAYIDX2]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
+// RV64_ZVFH-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
+// RV64_ZVFH-NEXT: store half [[TMP11]], ptr [[ARRAYIDX4]], align 2, !tbaa [[TBAA22]]
+// RV64_ZVFH-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+// RV64_ZVFH-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+// RV64_ZVFH-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
+//
+void f16max() {
+ for (int i=0; i<4096; i++) {cf16[i] = __builtin_fmaximum_numf16(af16[i], bf16[i]);}
+}
+
+//.
+// ARMV8: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// ARMV8: [[META7]] = !{!"float", [[META8:![0-9]+]], i64 0}
+// ARMV8: [[META8]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0}
+// ARMV8: [[META9]] = !{!"Simple C/C++ TBAA"}
+// ARMV8: [[LOOP10]] = distinct !{[[LOOP10]], [[META11:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]]}
+// ARMV8: [[META11]] = !{!"llvm.loop.mustprogress"}
+// ARMV8: [[META12]] = !{!"llvm.loop.isvectorized", i32 1}
+// ARMV8: [[META13]] = !{!"llvm.loop.unroll.runtime.disable"}
+// ARMV8: [[LOOP14]] = distinct !{[[LOOP14]], [[META11]], [[META12]], [[META13]]}
+// ARMV8: [[TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0}
+// ARMV8: [[META16]] = !{!"double", [[META8]], i64 0}
+// ARMV8: [[LOOP17]] = distinct !{[[LOOP17]], [[META11]], [[META12]], [[META13]]}
+// ARMV8: [[LOOP18]] = distinct !{[[LOOP18]], [[META11]], [[META12]], [[META13]]}
+// ARMV8: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0}
+// ARMV8: [[META20]] = !{!"__fp16", [[META8]], i64 0}
+// ARMV8: [[LOOP21]] = distinct !{[[LOOP21]], [[META11]], [[META12]], [[META13]]}
+// ARMV8: [[LOOP22]] = distinct !{[[LOOP22]], [[META11]], [[META12]], [[META13]]}
+//.
+// RV64_ZVFH: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0}
+// RV64_ZVFH: [[META10]] = !{!"float", [[META11:![0-9]+]], i64 0}
+// RV64_ZVFH: [[META11]] = !{!"omnipotent char", [[META12:![0-9]+]], i64 0}
+// RV64_ZVFH: [[META12]] = !{!"Simple C/C++ TBAA"}
+// RV64_ZVFH: [[LOOP13]] = distinct !{[[LOOP13]], [[META14:![0-9]+]], [[META15:![0-9]+]], [[META16:![0-9]+]]}
+// RV64_ZVFH: [[META14]] = !{!"llvm.loop.mustprogress"}
+// RV64_ZVFH: [[META15]] = !{!"llvm.loop.isvectorized", i32 1}
+// RV64_ZVFH: [[META16]] = !{!"llvm.loop.unroll.runtime.disable"}
+// RV64_ZVFH: [[LOOP17]] = distinct !{[[LOOP17]], [[META14]], [[META15]], [[META16]]}
+// RV64_ZVFH: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+// RV64_ZVFH: [[META19]] = !{!"double", [[META11]], i64 0}
+// RV64_ZVFH: [[LOOP20]] = distinct !{[[LOOP20]], [[META14]], [[META15]], [[META16]]}
+// RV64_ZVFH: [[LOOP21]] = distinct !{[[LOOP21]], [[META14]], [[META15]], [[META16]]}
+// RV64_ZVFH: [[TBAA22]] = !{[[META23:![0-9]+]], [[META23]], i64 0}
+// RV64_ZVFH: [[META23]] = !{!"__fp16", [[META11]], i64 0}
+// RV64_ZVFH: [[LOOP24]] = distinct !{[[LOOP24]], [[META14]], [[META15]], [[META16]]}
+// RV64_ZVFH: [[LOOP25]] = distinct !{[[LOOP25]], [[META14]], [[META16]], [[META15]]}
+// RV64_ZVFH: [[LOOP26]] = distinct !{[[LOOP26]], [[META14]], [[META15]], [[META16]]}
+// RV64_ZVFH: [[LOOP27]] = distinct !{[[LOOP27]], [[META14]], [[META16]], [[META15]]}
+//.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index eacf75c24695f..1d06a659b38cf 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2776,6 +2776,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
return Cost;
}
+ case Intrinsic::maximumnum:
+ case Intrinsic::minimumnum: {
+ if (TLI->isOperationLegalOrPromote(llvm::ISD::FMAXNUM_IEEE, LT.second))
+ return LT.first * 3;
+ break;
+ }
default:
break;
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 91ba68fe03324..422058be22edb 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -89,6 +89,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::maxnum:
case Intrinsic::minimum:
case Intrinsic::maximum:
+ case Intrinsic::minimumnum:
+ case Intrinsic::maximumnum:
case Intrinsic::modf:
case Intrinsic::copysign:
case Intrinsic::floor:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 70ec57798db71..ce7918da11f74 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -962,6 +962,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
static const unsigned ZvfhminZvfbfminPromoteOps[] = {
ISD::FMINNUM,
ISD::FMAXNUM,
+ ISD::FMINIMUMNUM,
+ ISD::FMAXIMUMNUM,
ISD::FADD,
ISD::FSUB,
ISD::FMUL,
@@ -1030,7 +1032,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// Expand various condition codes (explained above).
setCondCodeAction(VFPCCToExpand, VT, Expand);
- setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, VT, Legal);
+ setOperationAction(
+ {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMAXIMUMNUM, ISD::FMINIMUMNUM}, VT,
+ Legal);
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, VT, Custom);
setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND,
@@ -1445,7 +1449,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
ISD::FNEG, ISD::FABS, ISD::FCOPYSIGN, ISD::FSQRT,
ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
- ISD::IS_FPCLASS, ISD::FMAXIMUM, ISD::FMINIMUM},
+ ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM, ISD::IS_FPCLASS,
+ ISD::FMAXIMUM, ISD::FMINIMUM},
VT, Custom);
setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND,
@@ -6804,9 +6809,11 @@ static unsigned getRISCVVLOp(SDValue Op) {
case ISD::VP_FP_TO_UINT:
return RISCVISD::VFCVT_RTZ_XU_F_VL;
case ISD::FMINNUM:
+ case ISD::FMINIMUMNUM:
case ISD::VP_FMINNUM:
return RISCVISD::VFMIN_VL;
case ISD::FMAXNUM:
+ case ISD::FMAXIMUMNUM:
case ISD::VP_FMAXNUM:
return RISCVISD::VFMAX_VL;
case ISD::LRINT:
@@ -7837,6 +7844,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::FMA:
case ISD::FMINNUM:
case ISD::FMAXNUM:
+ case ISD::FMINIMUMNUM:
+ case ISD::FMAXIMUMNUM:
if (isPromotedOpNeedingSplit(Op, Subtarget))
return SplitVectorOp(Op, DAG);
[[fallthrough]];
>From f3887cfac9df0e527272d51ae4cff18d5924f479 Mon Sep 17 00:00:00 2001
From: YunQiang Su <yunqiang at isrc.iscas.ac.cn>
Date: Mon, 31 Mar 2025 08:20:32 +0000
Subject: [PATCH 2/2] use opt for testcase instead of clang
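
The test is now driven through opt on LLVM IR rather than clang -O3. A minimal
sketch of the shape such a LoopVectorize test takes is below; the RUN line,
triple, attributes, and function name are illustrative assumptions and may
differ from the committed llvm/test/Transforms/LoopVectorize/fminimumnum.ll.

; RUN: opt < %s -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+zvfh -S | FileCheck %s

; Scalar loop calling llvm.minimumnum.f32; the vectorizer is expected to
; widen it into llvm.minimumnum calls on a (scalable) vector type.
define void @fmin32(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %pa = getelementptr inbounds float, ptr %a, i64 %iv
  %pb = getelementptr inbounds float, ptr %b, i64 %iv
  %pc = getelementptr inbounds float, ptr %c, i64 %iv
  %x = load float, ptr %pa, align 4
  %y = load float, ptr %pb, align 4
  %m = call float @llvm.minimumnum.f32(float %x, float %y)
  store float %m, ptr %pc, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %done = icmp eq i64 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

declare float @llvm.minimumnum.f32(float, float)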
---
clang/test/CodeGen/fminimum-num-autovec.c | 407 -------
.../Transforms/LoopVectorize/fminimumnum.ll | 1059 +++++++++++++++++
2 files changed, 1059 insertions(+), 407 deletions(-)
delete mode 100644 clang/test/CodeGen/fminimum-num-autovec.c
create mode 100644 llvm/test/Transforms/LoopVectorize/fminimumnum.ll
diff --git a/clang/test/CodeGen/fminimum-num-autovec.c b/clang/test/CodeGen/fminimum-num-autovec.c
deleted file mode 100644
index 94114b6227d27..0000000000000
--- a/clang/test/CodeGen/fminimum-num-autovec.c
+++ /dev/null
@@ -1,407 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang --target=aarch64-unknown-linux-gnu -march=armv8+fp16 %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=ARMV8
-// RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64gv_zvfh %s -O3 -emit-llvm -S -o - | FileCheck %s --check-prefix=RV64_ZVFH
-// FIXME: SVE cannot emit VSCALE.
-
-
-float af32[4096];
-float bf32[4096];
-float cf32[4096];
-// ARMV8-LABEL: define dso_local void @f32min(
-// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// ARMV8-NEXT: [[ENTRY:.*]]:
-// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
-// ARMV8: [[VECTOR_BODY]]:
-// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA6:![0-9]+]]
-// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA6]]
-// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA6]]
-// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA6]]
-// ARMV8-NEXT: [[TMP4:%.*]] = tail call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
-// ARMV8-NEXT: [[TMP5:%.*]] = tail call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
-// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
-// ARMV8-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa [[TBAA6]]
-// ARMV8-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa [[TBAA6]]
-// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-// ARMV8: [[FOR_COND_CLEANUP]]:
-// ARMV8-NEXT: ret void
-//
-// RV64_ZVFH-LABEL: define dso_local void @f32min(
-// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
-// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
-// RV64_ZVFH: [[VECTOR_BODY]]:
-// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA9:![0-9]+]]
-// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA9]]
-// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD10]])
-// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA9]]
-// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
-// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
-// RV64_ZVFH-NEXT: ret void
-//
-void f32min() {
- for (int i=0; i<4096; i++) {cf32[i] = __builtin_fminimum_numf(af32[i], bf32[i]);}
-}
-// ARMV8-LABEL: define dso_local void @f32max(
-// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
-// ARMV8-NEXT: [[ENTRY:.*]]:
-// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
-// ARMV8: [[VECTOR_BODY]]:
-// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA6]]
-// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA6]]
-// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA6]]
-// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA6]]
-// ARMV8-NEXT: [[TMP4:%.*]] = tail call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
-// ARMV8-NEXT: [[TMP5:%.*]] = tail call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD11]], <4 x float> [[WIDE_LOAD13]])
-// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
-// ARMV8-NEXT: store <4 x float> [[TMP4]], ptr [[TMP6]], align 4, !tbaa [[TBAA6]]
-// ARMV8-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4, !tbaa [[TBAA6]]
-// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-// ARMV8: [[FOR_COND_CLEANUP]]:
-// ARMV8-NEXT: ret void
-//
-// RV64_ZVFH-LABEL: define dso_local void @f32max(
-// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
-// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
-// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
-// RV64_ZVFH: [[VECTOR_BODY]]:
-// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4, !tbaa [[TBAA9]]
-// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4, !tbaa [[TBAA9]]
-// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.maximumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD10]])
-// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA9]]
-// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
-// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
-// RV64_ZVFH-NEXT: ret void
-//
-void f32max() {
- for (int i=0; i<4096; i++) {cf32[i] = __builtin_fmaximum_numf(af32[i], bf32[i]);}
-}
-
-double af64[4096];
-double bf64[4096];
-double cf64[4096];
-// ARMV8-LABEL: define dso_local void @f64min(
-// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
-// ARMV8-NEXT: [[ENTRY:.*]]:
-// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
-// ARMV8: [[VECTOR_BODY]]:
-// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA15:![0-9]+]]
-// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA15]]
-// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA15]]
-// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA15]]
-// ARMV8-NEXT: [[TMP4:%.*]] = tail call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
-// ARMV8-NEXT: [[TMP5:%.*]] = tail call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> [[WIDE_LOAD13]])
-// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
-// ARMV8-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa [[TBAA15]]
-// ARMV8-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa [[TBAA15]]
-// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-// ARMV8: [[FOR_COND_CLEANUP]]:
-// ARMV8-NEXT: ret void
-//
-// RV64_ZVFH-LABEL: define dso_local void @f64min(
-// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
-// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
-// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
-// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
-// RV64_ZVFH: [[VECTOR_BODY]]:
-// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA18:![0-9]+]]
-// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA18]]
-// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.minimumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD10]])
-// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], align 8, !tbaa [[TBAA18]]
-// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
-// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
-// RV64_ZVFH-NEXT: ret void
-//
-void f64min() {
- for (int i=0; i<4096; i++) {cf64[i] = __builtin_fminimum_num(af64[i], bf64[i]);}
-}
-// ARMV8-LABEL: define dso_local void @f64max(
-// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
-// ARMV8-NEXT: [[ENTRY:.*]]:
-// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
-// ARMV8: [[VECTOR_BODY]]:
-// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA15]]
-// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA15]]
-// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA15]]
-// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA15]]
-// ARMV8-NEXT: [[TMP4:%.*]] = tail call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD12]])
-// ARMV8-NEXT: [[TMP5:%.*]] = tail call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD11]], <2 x double> [[WIDE_LOAD13]])
-// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
-// ARMV8-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 8, !tbaa [[TBAA15]]
-// ARMV8-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 8, !tbaa [[TBAA15]]
-// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-// ARMV8: [[FOR_COND_CLEANUP]]:
-// ARMV8-NEXT: ret void
-//
-// RV64_ZVFH-LABEL: define dso_local void @f64max(
-// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
-// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
-// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// RV64_ZVFH-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
-// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
-// RV64_ZVFH: [[VECTOR_BODY]]:
-// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// RV64_ZVFH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA18]]
-// RV64_ZVFH-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8, !tbaa [[TBAA18]]
-// RV64_ZVFH-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.maximumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD10]])
-// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: store <vscale x 2 x double> [[TMP4]], ptr [[TMP5]], align 8, !tbaa [[TBAA18]]
-// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
-// RV64_ZVFH-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-// RV64_ZVFH-NEXT: br i1 [[TMP6]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
-// RV64_ZVFH-NEXT: ret void
-//
-void f64max() {
- for (int i=0; i<4096; i++) {cf64[i] = __builtin_fmaximum_num(af64[i], bf64[i]);}
-}
-
-__fp16 af16[4096];
-__fp16 bf16[4096];
-__fp16 cf16[4096];
-// ARMV8-LABEL: define dso_local void @f16min(
-// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
-// ARMV8-NEXT: [[ENTRY:.*]]:
-// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
-// ARMV8: [[VECTOR_BODY]]:
-// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2, !tbaa [[TBAA19:![0-9]+]]
-// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x half>, ptr [[TMP1]], align 2, !tbaa [[TBAA19]]
-// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x half>, ptr [[TMP2]], align 2, !tbaa [[TBAA19]]
-// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x half>, ptr [[TMP3]], align 2, !tbaa [[TBAA19]]
-// ARMV8-NEXT: [[TMP4:%.*]] = tail call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD12]])
-// ARMV8-NEXT: [[TMP5:%.*]] = tail call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD11]], <8 x half> [[WIDE_LOAD13]])
-// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
-// ARMV8-NEXT: store <8 x half> [[TMP4]], ptr [[TMP6]], align 2, !tbaa [[TBAA19]]
-// ARMV8-NEXT: store <8 x half> [[TMP5]], ptr [[TMP7]], align 2, !tbaa [[TBAA19]]
-// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-// ARMV8: [[FOR_COND_CLEANUP]]:
-// ARMV8-NEXT: ret void
-//
-// RV64_ZVFH-LABEL: define dso_local void @f16min(
-// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
-// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
-// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// RV64_ZVFH-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ugt i64 [[TMP0]], 512
-// RV64_ZVFH-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
-// RV64_ZVFH: [[VECTOR_PH]]:
-// RV64_ZVFH-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// RV64_ZVFH-NEXT: [[DOTNEG:%.*]] = mul nuw nsw i64 [[TMP1]], 8184
-// RV64_ZVFH-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], 4096
-// RV64_ZVFH-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
-// RV64_ZVFH-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
-// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
-// RV64_ZVFH: [[VECTOR_BODY]]:
-// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// RV64_ZVFH-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2, !tbaa [[TBAA22:![0-9]+]]
-// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x half>, ptr [[TMP5]], align 2, !tbaa [[TBAA22]]
-// RV64_ZVFH-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.minimumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD10]])
-// RV64_ZVFH-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: store <vscale x 8 x half> [[TMP6]], ptr [[TMP7]], align 2, !tbaa [[TBAA22]]
-// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-// RV64_ZVFH-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-// RV64_ZVFH-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-// RV64_ZVFH: [[MIDDLE_BLOCK]]:
-// RV64_ZVFH-NEXT: [[CMP_N_NOT:%.*]] = icmp eq i64 [[N_VEC]], 0
-// RV64_ZVFH-NEXT: br i1 [[CMP_N_NOT]], label %[[FOR_BODY_PREHEADER]], label %[[FOR_COND_CLEANUP:.*]]
-// RV64_ZVFH: [[FOR_BODY_PREHEADER]]:
-// RV64_ZVFH-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
-// RV64_ZVFH-NEXT: br label %[[FOR_BODY:.*]]
-// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
-// RV64_ZVFH-NEXT: ret void
-// RV64_ZVFH: [[FOR_BODY]]:
-// RV64_ZVFH-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER]] ]
-// RV64_ZVFH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
-// RV64_ZVFH-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX]], align 2, !tbaa [[TBAA22]]
-// RV64_ZVFH-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
-// RV64_ZVFH-NEXT: [[TMP10:%.*]] = load half, ptr [[ARRAYIDX2]], align 2, !tbaa [[TBAA22]]
-// RV64_ZVFH-NEXT: [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
-// RV64_ZVFH-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
-// RV64_ZVFH-NEXT: store half [[TMP11]], ptr [[ARRAYIDX4]], align 2, !tbaa [[TBAA22]]
-// RV64_ZVFH-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-// RV64_ZVFH-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-// RV64_ZVFH-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-//
-void f16min() {
- for (int i=0; i<4096; i++) {cf16[i] = __builtin_fminimum_numf16(af16[i], bf16[i]);}
-}
-// ARMV8-LABEL: define dso_local void @f16max(
-// ARMV8-SAME: ) local_unnamed_addr #[[ATTR0]] {
-// ARMV8-NEXT: [[ENTRY:.*]]:
-// ARMV8-NEXT: br label %[[VECTOR_BODY:.*]]
-// ARMV8: [[VECTOR_BODY]]:
-// ARMV8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// ARMV8-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2, !tbaa [[TBAA19]]
-// ARMV8-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x half>, ptr [[TMP1]], align 2, !tbaa [[TBAA19]]
-// ARMV8-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 16
-// ARMV8-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x half>, ptr [[TMP2]], align 2, !tbaa [[TBAA19]]
-// ARMV8-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x half>, ptr [[TMP3]], align 2, !tbaa [[TBAA19]]
-// ARMV8-NEXT: [[TMP4:%.*]] = tail call <8 x half> @llvm.maximumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD12]])
-// ARMV8-NEXT: [[TMP5:%.*]] = tail call <8 x half> @llvm.maximumnum.v8f16(<8 x half> [[WIDE_LOAD11]], <8 x half> [[WIDE_LOAD13]])
-// ARMV8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
-// ARMV8-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16
-// ARMV8-NEXT: store <8 x half> [[TMP4]], ptr [[TMP6]], align 2, !tbaa [[TBAA19]]
-// ARMV8-NEXT: store <8 x half> [[TMP5]], ptr [[TMP7]], align 2, !tbaa [[TBAA19]]
-// ARMV8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-// ARMV8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-// ARMV8-NEXT: br i1 [[TMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-// ARMV8: [[FOR_COND_CLEANUP]]:
-// ARMV8-NEXT: ret void
-//
-// RV64_ZVFH-LABEL: define dso_local void @f16max(
-// RV64_ZVFH-SAME: ) local_unnamed_addr #[[ATTR0]] {
-// RV64_ZVFH-NEXT: [[ENTRY:.*]]:
-// RV64_ZVFH-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-// RV64_ZVFH-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ugt i64 [[TMP0]], 512
-// RV64_ZVFH-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
-// RV64_ZVFH: [[VECTOR_PH]]:
-// RV64_ZVFH-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-// RV64_ZVFH-NEXT: [[DOTNEG:%.*]] = mul nuw nsw i64 [[TMP1]], 8184
-// RV64_ZVFH-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], 4096
-// RV64_ZVFH-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
-// RV64_ZVFH-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
-// RV64_ZVFH-NEXT: br label %[[VECTOR_BODY:.*]]
-// RV64_ZVFH: [[VECTOR_BODY]]:
-// RV64_ZVFH-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-// RV64_ZVFH-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2, !tbaa [[TBAA22]]
-// RV64_ZVFH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x half>, ptr [[TMP5]], align 2, !tbaa [[TBAA22]]
-// RV64_ZVFH-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.maximumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD10]])
-// RV64_ZVFH-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
-// RV64_ZVFH-NEXT: store <vscale x 8 x half> [[TMP6]], ptr [[TMP7]], align 2, !tbaa [[TBAA22]]
-// RV64_ZVFH-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-// RV64_ZVFH-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-// RV64_ZVFH-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-// RV64_ZVFH: [[MIDDLE_BLOCK]]:
-// RV64_ZVFH-NEXT: [[CMP_N_NOT:%.*]] = icmp eq i64 [[N_VEC]], 0
-// RV64_ZVFH-NEXT: br i1 [[CMP_N_NOT]], label %[[FOR_BODY_PREHEADER]], label %[[FOR_COND_CLEANUP:.*]]
-// RV64_ZVFH: [[FOR_BODY_PREHEADER]]:
-// RV64_ZVFH-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
-// RV64_ZVFH-NEXT: br label %[[FOR_BODY:.*]]
-// RV64_ZVFH: [[FOR_COND_CLEANUP]]:
-// RV64_ZVFH-NEXT: ret void
-// RV64_ZVFH: [[FOR_BODY]]:
-// RV64_ZVFH-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER]] ]
-// RV64_ZVFH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
-// RV64_ZVFH-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX]], align 2, !tbaa [[TBAA22]]
-// RV64_ZVFH-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
-// RV64_ZVFH-NEXT: [[TMP10:%.*]] = load half, ptr [[ARRAYIDX2]], align 2, !tbaa [[TBAA22]]
-// RV64_ZVFH-NEXT: [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
-// RV64_ZVFH-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
-// RV64_ZVFH-NEXT: store half [[TMP11]], ptr [[ARRAYIDX4]], align 2, !tbaa [[TBAA22]]
-// RV64_ZVFH-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-// RV64_ZVFH-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-// RV64_ZVFH-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
-//
-void f16max() {
- for (int i=0; i<4096; i++) {cf16[i] = __builtin_fmaximum_numf16(af16[i], bf16[i]);}
-}
-
-//.
-// ARMV8: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
-// ARMV8: [[META7]] = !{!"float", [[META8:![0-9]+]], i64 0}
-// ARMV8: [[META8]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0}
-// ARMV8: [[META9]] = !{!"Simple C/C++ TBAA"}
-// ARMV8: [[LOOP10]] = distinct !{[[LOOP10]], [[META11:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]]}
-// ARMV8: [[META11]] = !{!"llvm.loop.mustprogress"}
-// ARMV8: [[META12]] = !{!"llvm.loop.isvectorized", i32 1}
-// ARMV8: [[META13]] = !{!"llvm.loop.unroll.runtime.disable"}
-// ARMV8: [[LOOP14]] = distinct !{[[LOOP14]], [[META11]], [[META12]], [[META13]]}
-// ARMV8: [[TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0}
-// ARMV8: [[META16]] = !{!"double", [[META8]], i64 0}
-// ARMV8: [[LOOP17]] = distinct !{[[LOOP17]], [[META11]], [[META12]], [[META13]]}
-// ARMV8: [[LOOP18]] = distinct !{[[LOOP18]], [[META11]], [[META12]], [[META13]]}
-// ARMV8: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0}
-// ARMV8: [[META20]] = !{!"__fp16", [[META8]], i64 0}
-// ARMV8: [[LOOP21]] = distinct !{[[LOOP21]], [[META11]], [[META12]], [[META13]]}
-// ARMV8: [[LOOP22]] = distinct !{[[LOOP22]], [[META11]], [[META12]], [[META13]]}
-//.
-// RV64_ZVFH: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0}
-// RV64_ZVFH: [[META10]] = !{!"float", [[META11:![0-9]+]], i64 0}
-// RV64_ZVFH: [[META11]] = !{!"omnipotent char", [[META12:![0-9]+]], i64 0}
-// RV64_ZVFH: [[META12]] = !{!"Simple C/C++ TBAA"}
-// RV64_ZVFH: [[LOOP13]] = distinct !{[[LOOP13]], [[META14:![0-9]+]], [[META15:![0-9]+]], [[META16:![0-9]+]]}
-// RV64_ZVFH: [[META14]] = !{!"llvm.loop.mustprogress"}
-// RV64_ZVFH: [[META15]] = !{!"llvm.loop.isvectorized", i32 1}
-// RV64_ZVFH: [[META16]] = !{!"llvm.loop.unroll.runtime.disable"}
-// RV64_ZVFH: [[LOOP17]] = distinct !{[[LOOP17]], [[META14]], [[META15]], [[META16]]}
-// RV64_ZVFH: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
-// RV64_ZVFH: [[META19]] = !{!"double", [[META11]], i64 0}
-// RV64_ZVFH: [[LOOP20]] = distinct !{[[LOOP20]], [[META14]], [[META15]], [[META16]]}
-// RV64_ZVFH: [[LOOP21]] = distinct !{[[LOOP21]], [[META14]], [[META15]], [[META16]]}
-// RV64_ZVFH: [[TBAA22]] = !{[[META23:![0-9]+]], [[META23]], i64 0}
-// RV64_ZVFH: [[META23]] = !{!"__fp16", [[META11]], i64 0}
-// RV64_ZVFH: [[LOOP24]] = distinct !{[[LOOP24]], [[META14]], [[META15]], [[META16]]}
-// RV64_ZVFH: [[LOOP25]] = distinct !{[[LOOP25]], [[META14]], [[META16]], [[META15]]}
-// RV64_ZVFH: [[LOOP26]] = distinct !{[[LOOP26]], [[META14]], [[META15]], [[META16]]}
-// RV64_ZVFH: [[LOOP27]] = distinct !{[[LOOP27]], [[META14]], [[META16]], [[META15]]}
-//.
diff --git a/llvm/test/Transforms/LoopVectorize/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/fminimumnum.ll
new file mode 100644
index 0000000000000..59375d5fb4f8d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/fminimumnum.ll
@@ -0,0 +1,1059 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --passes=loop-vectorize --mtriple=riscv64 -mattr="+zvfh,+v" -S < %s | FileCheck %s --check-prefix=RV64
+; RUN: opt --passes=loop-vectorize --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s --check-prefix=ARM64
+; FIXME: AArch64 with SVE cannot emit vscale-style (scalable vector) code yet.
+; RUN: opt --passes=loop-vectorize --mtriple=x86_64 -S < %s | FileCheck %s --check-prefix=X64
+
+ at af32 = dso_local local_unnamed_addr global [4096 x float] zeroinitializer, align 4
+ at bf32 = dso_local local_unnamed_addr global [4096 x float] zeroinitializer, align 4
+ at cf32 = dso_local local_unnamed_addr global [4096 x float] zeroinitializer, align 4
+ at af64 = dso_local local_unnamed_addr global [4096 x double] zeroinitializer, align 8
+ at bf64 = dso_local local_unnamed_addr global [4096 x double] zeroinitializer, align 8
+ at cf64 = dso_local local_unnamed_addr global [4096 x double] zeroinitializer, align 8
+ at af16 = dso_local local_unnamed_addr global [4096 x half] zeroinitializer, align 2
+ at bf16 = dso_local local_unnamed_addr global [4096 x half] zeroinitializer, align 2
+ at cf16 = dso_local local_unnamed_addr global [4096 x half] zeroinitializer, align 2
+
+; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable
+define dso_local void @f32min() local_unnamed_addr {
+; RV64-LABEL: define dso_local void @f32min(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; RV64-NEXT: [[ENTRY:.*]]:
+; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP1]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP3]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
+; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 0
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 0
+; RV64-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
+; RV64-NEXT: [[TMP10:%.*]] = call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD1]])
+; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 0
+; RV64-NEXT: store <vscale x 4 x float> [[TMP10]], ptr [[TMP12]], align 4
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV64-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; RV64-NEXT: [[TMP16:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP14]], float [[TMP15]])
+; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: store float [[TMP16]], ptr [[ARRAYIDX4]], align 4
+; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; ARM64-LABEL: define dso_local void @f32min(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; ARM64-NEXT: [[ENTRY:.*]]:
+; ARM64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ARM64: [[VECTOR_PH]]:
+; ARM64-NEXT: br label %[[VECTOR_BODY:.*]]
+; ARM64: [[VECTOR_BODY]]:
+; ARM64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ARM64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 0
+; ARM64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 4
+; ARM64-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; ARM64-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; ARM64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0
+; ARM64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4
+; ARM64-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; ARM64-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; ARM64-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD2]])
+; ARM64-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD3]])
+; ARM64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 0
+; ARM64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 4
+; ARM64-NEXT: store <4 x float> [[TMP6]], ptr [[TMP9]], align 4
+; ARM64-NEXT: store <4 x float> [[TMP7]], ptr [[TMP10]], align 4
+; ARM64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; ARM64-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; ARM64-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; ARM64: [[MIDDLE_BLOCK]]:
+; ARM64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; ARM64: [[SCALAR_PH]]:
+; ARM64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ARM64-NEXT: br label %[[FOR_BODY:.*]]
+; ARM64: [[FOR_COND_CLEANUP]]:
+; ARM64-NEXT: ret void
+; ARM64: [[FOR_BODY]]:
+; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; ARM64-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; X64-LABEL: define dso_local void @f32min() local_unnamed_addr {
+; X64-NEXT: [[ENTRY:.*]]:
+; X64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; X64: [[VECTOR_PH]]:
+; X64-NEXT: br label %[[VECTOR_BODY:.*]]
+; X64: [[VECTOR_BODY]]:
+; X64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; X64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 0
+; X64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 4
+; X64-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; X64-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; X64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0
+; X64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4
+; X64-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; X64-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; X64-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD2]])
+; X64-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD3]])
+; X64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 0
+; X64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 4
+; X64-NEXT: store <4 x float> [[TMP6]], ptr [[TMP9]], align 4
+; X64-NEXT: store <4 x float> [[TMP7]], ptr [[TMP10]], align 4
+; X64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; X64-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; X64-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; X64: [[MIDDLE_BLOCK]]:
+; X64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; X64: [[SCALAR_PH]]:
+; X64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; X64-NEXT: br label %[[FOR_BODY:.*]]
+; X64: [[FOR_COND_CLEANUP]]:
+; X64-NEXT: ret void
+; X64: [[FOR_BODY]]:
+; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; X64-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx2, align 4
+ %2 = tail call float @llvm.minimumnum.f32(float %0, float %1)
+ %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 %indvars.iv
+ store float %2, ptr %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.minimumnum.f32(float, float)
+
+; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable
+define dso_local void @f32max() local_unnamed_addr {
+; RV64-LABEL: define dso_local void @f32max(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; RV64-NEXT: [[ENTRY:.*]]:
+; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP1]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP3]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
+; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 0
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 0
+; RV64-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
+; RV64-NEXT: [[TMP10:%.*]] = call <vscale x 4 x float> @llvm.maximumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD1]])
+; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 0
+; RV64-NEXT: store <vscale x 4 x float> [[TMP10]], ptr [[TMP12]], align 4
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV64-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; RV64-NEXT: [[TMP16:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP14]], float [[TMP15]])
+; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: store float [[TMP16]], ptr [[ARRAYIDX4]], align 4
+; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+; ARM64-LABEL: define dso_local void @f32max(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; ARM64-NEXT: [[ENTRY:.*]]:
+; ARM64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ARM64: [[VECTOR_PH]]:
+; ARM64-NEXT: br label %[[VECTOR_BODY:.*]]
+; ARM64: [[VECTOR_BODY]]:
+; ARM64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ARM64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 0
+; ARM64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 4
+; ARM64-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; ARM64-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; ARM64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0
+; ARM64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4
+; ARM64-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; ARM64-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; ARM64-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD2]])
+; ARM64-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD3]])
+; ARM64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 0
+; ARM64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 4
+; ARM64-NEXT: store <4 x float> [[TMP6]], ptr [[TMP9]], align 4
+; ARM64-NEXT: store <4 x float> [[TMP7]], ptr [[TMP10]], align 4
+; ARM64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; ARM64-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; ARM64-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; ARM64: [[MIDDLE_BLOCK]]:
+; ARM64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; ARM64: [[SCALAR_PH]]:
+; ARM64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ARM64-NEXT: br label %[[FOR_BODY:.*]]
+; ARM64: [[FOR_COND_CLEANUP]]:
+; ARM64-NEXT: ret void
+; ARM64: [[FOR_BODY]]:
+; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; ARM64-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+; X64-LABEL: define dso_local void @f32max() local_unnamed_addr {
+; X64-NEXT: [[ENTRY:.*]]:
+; X64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; X64: [[VECTOR_PH]]:
+; X64-NEXT: br label %[[VECTOR_BODY:.*]]
+; X64: [[VECTOR_BODY]]:
+; X64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; X64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 0
+; X64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 4
+; X64-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; X64-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; X64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0
+; X64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4
+; X64-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; X64-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; X64-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD2]])
+; X64-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD3]])
+; X64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 0
+; X64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 4
+; X64-NEXT: store <4 x float> [[TMP6]], ptr [[TMP9]], align 4
+; X64-NEXT: store <4 x float> [[TMP7]], ptr [[TMP10]], align 4
+; X64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; X64-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; X64-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; X64: [[MIDDLE_BLOCK]]:
+; X64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; X64: [[SCALAR_PH]]:
+; X64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; X64-NEXT: br label %[[FOR_BODY:.*]]
+; X64: [[FOR_COND_CLEANUP]]:
+; X64-NEXT: ret void
+; X64: [[FOR_BODY]]:
+; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; X64-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @af32, i64 0, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr @bf32, i64 0, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx2, align 4
+ %2 = tail call float @llvm.maximumnum.f32(float %0, float %1)
+ %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @cf32, i64 0, i64 %indvars.iv
+ store float %2, ptr %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.maximumnum.f32(float, float)
+
+; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable
+define dso_local void @f64min() local_unnamed_addr {
+; RV64-LABEL: define dso_local void @f64min(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; RV64-NEXT: [[ENTRY:.*]]:
+; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP1]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP3]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
+; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw double, ptr [[TMP6]], i32 0
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP7]], align 8
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw double, ptr [[TMP8]], i32 0
+; RV64-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x double>, ptr [[TMP9]], align 8
+; RV64-NEXT: [[TMP10:%.*]] = call <vscale x 2 x double> @llvm.minimumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD1]])
+; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw double, ptr [[TMP11]], i32 0
+; RV64-NEXT: store <vscale x 2 x double> [[TMP10]], ptr [[TMP12]], align 8
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV64-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; RV64-NEXT: [[TMP16:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP14]], double [[TMP15]])
+; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: store double [[TMP16]], ptr [[ARRAYIDX4]], align 8
+; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; ARM64-LABEL: define dso_local void @f64min(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; ARM64-NEXT: [[ENTRY:.*]]:
+; ARM64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ARM64: [[VECTOR_PH]]:
+; ARM64-NEXT: br label %[[VECTOR_BODY:.*]]
+; ARM64: [[VECTOR_BODY]]:
+; ARM64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ARM64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 0
+; ARM64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 2
+; ARM64-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; ARM64-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
+; ARM64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 0
+; ARM64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 2
+; ARM64-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
+; ARM64-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; ARM64-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD2]])
+; ARM64-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD1]], <2 x double> [[WIDE_LOAD3]])
+; ARM64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw double, ptr [[TMP8]], i32 0
+; ARM64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw double, ptr [[TMP8]], i32 2
+; ARM64-NEXT: store <2 x double> [[TMP6]], ptr [[TMP9]], align 8
+; ARM64-NEXT: store <2 x double> [[TMP7]], ptr [[TMP10]], align 8
+; ARM64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; ARM64-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; ARM64-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; ARM64: [[MIDDLE_BLOCK]]:
+; ARM64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; ARM64: [[SCALAR_PH]]:
+; ARM64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ARM64-NEXT: br label %[[FOR_BODY:.*]]
+; ARM64: [[FOR_COND_CLEANUP]]:
+; ARM64-NEXT: ret void
+; ARM64: [[FOR_BODY]]:
+; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; ARM64-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; X64-LABEL: define dso_local void @f64min() local_unnamed_addr {
+; X64-NEXT: [[ENTRY:.*]]:
+; X64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; X64: [[VECTOR_PH]]:
+; X64-NEXT: br label %[[VECTOR_BODY:.*]]
+; X64: [[VECTOR_BODY]]:
+; X64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; X64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 0
+; X64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 2
+; X64-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; X64-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
+; X64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 0
+; X64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 2
+; X64-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
+; X64-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; X64-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD2]])
+; X64-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD1]], <2 x double> [[WIDE_LOAD3]])
+; X64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw double, ptr [[TMP8]], i32 0
+; X64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw double, ptr [[TMP8]], i32 2
+; X64-NEXT: store <2 x double> [[TMP6]], ptr [[TMP9]], align 8
+; X64-NEXT: store <2 x double> [[TMP7]], ptr [[TMP10]], align 8
+; X64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; X64-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; X64-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; X64: [[MIDDLE_BLOCK]]:
+; X64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; X64: [[SCALAR_PH]]:
+; X64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; X64-NEXT: br label %[[FOR_BODY:.*]]
+; X64: [[FOR_COND_CLEANUP]]:
+; X64-NEXT: ret void
+; X64: [[FOR_BODY]]:
+; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; X64-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 %indvars.iv
+ %0 = load double, ptr %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 %indvars.iv
+ %1 = load double, ptr %arrayidx2, align 8
+ %2 = tail call double @llvm.minimumnum.f64(double %0, double %1)
+ %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 %indvars.iv
+ store double %2, ptr %arrayidx4, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare double @llvm.minimumnum.f64(double, double)
+
+; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable
+define dso_local void @f64max() local_unnamed_addr {
+; RV64-LABEL: define dso_local void @f64max(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; RV64-NEXT: [[ENTRY:.*]]:
+; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP1]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP3]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
+; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw double, ptr [[TMP6]], i32 0
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP7]], align 8
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw double, ptr [[TMP8]], i32 0
+; RV64-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x double>, ptr [[TMP9]], align 8
+; RV64-NEXT: [[TMP10:%.*]] = call <vscale x 2 x double> @llvm.maximumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD1]])
+; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw double, ptr [[TMP11]], i32 0
+; RV64-NEXT: store <vscale x 2 x double> [[TMP10]], ptr [[TMP12]], align 8
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV64-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; RV64-NEXT: [[TMP16:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP14]], double [[TMP15]])
+; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: store double [[TMP16]], ptr [[ARRAYIDX4]], align 8
+; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+; ARM64-LABEL: define dso_local void @f64max(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; ARM64-NEXT: [[ENTRY:.*]]:
+; ARM64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ARM64: [[VECTOR_PH]]:
+; ARM64-NEXT: br label %[[VECTOR_BODY:.*]]
+; ARM64: [[VECTOR_BODY]]:
+; ARM64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ARM64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 0
+; ARM64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 2
+; ARM64-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; ARM64-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
+; ARM64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 0
+; ARM64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 2
+; ARM64-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
+; ARM64-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; ARM64-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD2]])
+; ARM64-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD1]], <2 x double> [[WIDE_LOAD3]])
+; ARM64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw double, ptr [[TMP8]], i32 0
+; ARM64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw double, ptr [[TMP8]], i32 2
+; ARM64-NEXT: store <2 x double> [[TMP6]], ptr [[TMP9]], align 8
+; ARM64-NEXT: store <2 x double> [[TMP7]], ptr [[TMP10]], align 8
+; ARM64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; ARM64-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; ARM64-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; ARM64: [[MIDDLE_BLOCK]]:
+; ARM64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; ARM64: [[SCALAR_PH]]:
+; ARM64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ARM64-NEXT: br label %[[FOR_BODY:.*]]
+; ARM64: [[FOR_COND_CLEANUP]]:
+; ARM64-NEXT: ret void
+; ARM64: [[FOR_BODY]]:
+; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; ARM64-NEXT: [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+; X64-LABEL: define dso_local void @f64max() local_unnamed_addr {
+; X64-NEXT: [[ENTRY:.*]]:
+; X64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; X64: [[VECTOR_PH]]:
+; X64-NEXT: br label %[[VECTOR_BODY:.*]]
+; X64: [[VECTOR_BODY]]:
+; X64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; X64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 0
+; X64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 2
+; X64-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; X64-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
+; X64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 0
+; X64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 2
+; X64-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
+; X64-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; X64-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD2]])
+; X64-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD1]], <2 x double> [[WIDE_LOAD3]])
+; X64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw double, ptr [[TMP8]], i32 0
+; X64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw double, ptr [[TMP8]], i32 2
+; X64-NEXT: store <2 x double> [[TMP6]], ptr [[TMP9]], align 8
+; X64-NEXT: store <2 x double> [[TMP7]], ptr [[TMP10]], align 8
+; X64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; X64-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; X64-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; X64: [[MIDDLE_BLOCK]]:
+; X64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; X64: [[SCALAR_PH]]:
+; X64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; X64-NEXT: br label %[[FOR_BODY:.*]]
+; X64: [[FOR_COND_CLEANUP]]:
+; X64-NEXT: ret void
+; X64: [[FOR_BODY]]:
+; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; X64-NEXT: [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @af64, i64 0, i64 %indvars.iv
+ %0 = load double, ptr %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @bf64, i64 0, i64 %indvars.iv
+ %1 = load double, ptr %arrayidx2, align 8
+ %2 = tail call double @llvm.maximumnum.f64(double %0, double %1)
+ %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @cf64, i64 0, i64 %indvars.iv
+ store double %2, ptr %arrayidx4, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare double @llvm.maximumnum.f64(double, double)
+
+; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable
+define dso_local void @f16min() local_unnamed_addr {
+; RV64-LABEL: define dso_local void @f16min(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; RV64-NEXT: [[ENTRY:.*]]:
+; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP1]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP3]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
+; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw half, ptr [[TMP6]], i32 0
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP7]], align 2
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw half, ptr [[TMP8]], i32 0
+; RV64-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x half>, ptr [[TMP9]], align 2
+; RV64-NEXT: [[TMP10:%.*]] = call <vscale x 8 x half> @llvm.minimumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD1]])
+; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw half, ptr [[TMP11]], i32 0
+; RV64-NEXT: store <vscale x 8 x half> [[TMP10]], ptr [[TMP12]], align 2
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV64-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; RV64-NEXT: [[TMP16:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP14]], half [[TMP15]])
+; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: store half [[TMP16]], ptr [[ARRAYIDX4]], align 2
+; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+; ARM64-LABEL: define dso_local void @f16min(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; ARM64-NEXT: [[ENTRY:.*]]:
+; ARM64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ARM64: [[VECTOR_PH]]:
+; ARM64-NEXT: br label %[[VECTOR_BODY:.*]]
+; ARM64: [[VECTOR_BODY]]:
+; ARM64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ARM64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw half, ptr [[TMP0]], i32 0
+; ARM64-NEXT: [[WIDE_LOAD:%.*]] = load <2 x half>, ptr [[TMP1]], align 2
+; ARM64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 0
+; ARM64-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x half>, ptr [[TMP3]], align 2
+; ARM64-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> [[WIDE_LOAD]], <2 x half> [[WIDE_LOAD1]])
+; ARM64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw half, ptr [[TMP5]], i32 0
+; ARM64-NEXT: store <2 x half> [[TMP4]], ptr [[TMP6]], align 2
+; ARM64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; ARM64-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; ARM64-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; ARM64: [[MIDDLE_BLOCK]]:
+; ARM64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; ARM64: [[SCALAR_PH]]:
+; ARM64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ARM64-NEXT: br label %[[FOR_BODY:.*]]
+; ARM64: [[FOR_COND_CLEANUP]]:
+; ARM64-NEXT: ret void
+; ARM64: [[FOR_BODY]]:
+; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; ARM64-NEXT: [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]])
+; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+; X64-LABEL: define dso_local void @f16min() local_unnamed_addr {
+; X64-NEXT: [[ENTRY:.*]]:
+; X64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; X64: [[VECTOR_PH]]:
+; X64-NEXT: br label %[[VECTOR_BODY:.*]]
+; X64: [[VECTOR_BODY]]:
+; X64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; X64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw half, ptr [[TMP0]], i32 0
+; X64-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP1]], align 2
+; X64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 0
+; X64-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP3]], align 2
+; X64-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD1]])
+; X64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw half, ptr [[TMP5]], i32 0
+; X64-NEXT: store <8 x half> [[TMP4]], ptr [[TMP6]], align 2
+; X64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; X64-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; X64-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; X64: [[MIDDLE_BLOCK]]:
+; X64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; X64: [[SCALAR_PH]]:
+; X64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; X64-NEXT: br label %[[FOR_BODY:.*]]
+; X64: [[FOR_COND_CLEANUP]]:
+; X64-NEXT: ret void
+; X64: [[FOR_BODY]]:
+; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; X64-NEXT: [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]])
+; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 %indvars.iv
+ %0 = load half, ptr %arrayidx, align 2
+ %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 %indvars.iv
+ %1 = load half, ptr %arrayidx2, align 2
+ %2 = tail call half @llvm.minimumnum.f16(half %0, half %1)
+ %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 %indvars.iv
+ store half %2, ptr %arrayidx4, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare half @llvm.minimumnum.f16(half, half)
+
+; Function Attrs: nofree norecurse nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none) uwtable
+define dso_local void @f16max() local_unnamed_addr {
+; RV64-LABEL: define dso_local void @f16max(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; RV64-NEXT: [[ENTRY:.*]]:
+; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 4096, [[TMP1]]
+; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; RV64: [[VECTOR_PH]]:
+; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 4096, [[TMP3]]
+; RV64-NEXT: [[N_VEC:%.*]] = sub i64 4096, [[N_MOD_VF]]
+; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; RV64-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; RV64-NEXT: br label %[[VECTOR_BODY:.*]]
+; RV64: [[VECTOR_BODY]]:
+; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; RV64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw half, ptr [[TMP6]], i32 0
+; RV64-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP7]], align 2
+; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw half, ptr [[TMP8]], i32 0
+; RV64-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x half>, ptr [[TMP9]], align 2
+; RV64-NEXT: [[TMP10:%.*]] = call <vscale x 8 x half> @llvm.maximumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD1]])
+; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+; RV64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw half, ptr [[TMP11]], i32 0
+; RV64-NEXT: store <vscale x 8 x half> [[TMP10]], ptr [[TMP12]], align 2
+; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; RV64-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; RV64-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; RV64: [[MIDDLE_BLOCK]]:
+; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, [[N_VEC]]
+; RV64-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; RV64: [[SCALAR_PH]]:
+; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; RV64-NEXT: [[TMP16:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP14]], half [[TMP15]])
+; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: store half [[TMP16]], ptr [[ARRAYIDX4]], align 2
+; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+;
+; ARM64-LABEL: define dso_local void @f16max(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; ARM64-NEXT: [[ENTRY:.*]]:
+; ARM64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ARM64: [[VECTOR_PH]]:
+; ARM64-NEXT: br label %[[VECTOR_BODY:.*]]
+; ARM64: [[VECTOR_BODY]]:
+; ARM64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ARM64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw half, ptr [[TMP0]], i32 0
+; ARM64-NEXT: [[WIDE_LOAD:%.*]] = load <2 x half>, ptr [[TMP1]], align 2
+; ARM64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 0
+; ARM64-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x half>, ptr [[TMP3]], align 2
+; ARM64-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> [[WIDE_LOAD]], <2 x half> [[WIDE_LOAD1]])
+; ARM64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+; ARM64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw half, ptr [[TMP5]], i32 0
+; ARM64-NEXT: store <2 x half> [[TMP4]], ptr [[TMP6]], align 2
+; ARM64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; ARM64-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; ARM64-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; ARM64: [[MIDDLE_BLOCK]]:
+; ARM64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; ARM64: [[SCALAR_PH]]:
+; ARM64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ARM64-NEXT: br label %[[FOR_BODY:.*]]
+; ARM64: [[FOR_COND_CLEANUP]]:
+; ARM64-NEXT: ret void
+; ARM64: [[FOR_BODY]]:
+; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; ARM64-NEXT: [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]])
+; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+;
+; X64-LABEL: define dso_local void @f16max() local_unnamed_addr {
+; X64-NEXT: [[ENTRY:.*]]:
+; X64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; X64: [[VECTOR_PH]]:
+; X64-NEXT: br label %[[VECTOR_BODY:.*]]
+; X64: [[VECTOR_BODY]]:
+; X64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; X64-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw half, ptr [[TMP0]], i32 0
+; X64-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP1]], align 2
+; X64-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 0
+; X64-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP3]], align 2
+; X64-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD1]])
+; X64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDEX]]
+; X64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw half, ptr [[TMP5]], i32 0
+; X64-NEXT: store <8 x half> [[TMP4]], ptr [[TMP6]], align 2
+; X64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; X64-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; X64-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; X64: [[MIDDLE_BLOCK]]:
+; X64-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; X64: [[SCALAR_PH]]:
+; X64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; X64-NEXT: br label %[[FOR_BODY:.*]]
+; X64: [[FOR_COND_CLEANUP]]:
+; X64-NEXT: ret void
+; X64: [[FOR_BODY]]:
+; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; X64-NEXT: [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]])
+; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @af16, i64 0, i64 %indvars.iv
+ %0 = load half, ptr %arrayidx, align 2
+ %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @bf16, i64 0, i64 %indvars.iv
+ %1 = load half, ptr %arrayidx2, align 2
+ %2 = tail call half @llvm.maximumnum.f16(half %0, half %1)
+ %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @cf16, i64 0, i64 %indvars.iv
+ store half %2, ptr %arrayidx4, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare half @llvm.maximumnum.f16(half, half)
+;.
+; RV64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; RV64: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; RV64: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; RV64: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; RV64: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; RV64: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; RV64: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; RV64: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; RV64: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; RV64: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; RV64: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; RV64: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; RV64: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; RV64: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.
+; ARM64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; ARM64: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; ARM64: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; ARM64: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; ARM64: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; ARM64: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; ARM64: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; ARM64: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; ARM64: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; ARM64: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; ARM64: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; ARM64: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; ARM64: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; ARM64: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.
+; X64: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; X64: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; X64: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; X64: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; X64: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; X64: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; X64: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; X64: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; X64: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; X64: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; X64: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; X64: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; X64: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; X64: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.