[llvm] LLVM/Test: Add vectorization test cases for fminimumnum and fmaximumnum (PR #133843)

YunQiang Su via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 31 21:36:04 PDT 2025


https://github.com/wzssyqa updated https://github.com/llvm/llvm-project/pull/133843

>From 31400324d52f2b30e0c2e86db8fae662edcc80b1 Mon Sep 17 00:00:00 2001
From: YunQiang Su <yunqiang at isrc.iscas.ac.cn>
Date: Mon, 31 Mar 2025 17:38:57 +0800
Subject: [PATCH 1/2] LLVM/Test: Add vectorization test cases for fminimumnum and
 fmaximumnum

Vectorization of fminimumnum and fmaximumnum is not supported yet.
Add test cases for them now; the tests will be updated once support
is added.

Also add SLP vectorizer tests.
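
For reference, a minimal sketch of the kind of transform these tests will
eventually check for, once the vectorizers handle the intrinsics. The
function name, value names, and vector width below are illustrative
assumptions, not the autogenerated CHECK lines in the tests:

  ; One vector iteration of the scalar f32min loop body (hypothetical VF=4).
  define void @f32min_vectorized_sketch(ptr %in1, ptr %in2, ptr %out) {
    %a = load <4 x float>, ptr %in1, align 4
    %b = load <4 x float>, ptr %in2, align 4
    %m = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %a, <4 x float> %b)
    store <4 x float> %m, ptr %out, align 4
    ret void
  }
  declare <4 x float> @llvm.minimumnum.v4f32(<4 x float>, <4 x float>)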
---
 .../LoopVectorize/AArch64/fminimumnum.ll      | 265 +++++++++
 .../LoopVectorize/RISCV/fminimumnum.ll        | 265 +++++++++
 .../LoopVectorize/X86/fminimumnum.ll          | 259 +++++++++
 .../SLPVectorizer/AArch64/fminimumnum.ll      | 516 ++++++++++++++++++
 .../SLPVectorizer/RISCV/fminimumnum.ll        | 516 ++++++++++++++++++
 .../SLPVectorizer/X86/fminimumnum.ll          | 510 +++++++++++++++++
 6 files changed, 2331 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll
 create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
new file mode 100644
index 0000000000000..16968b3d11420
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
+; RUN: opt --passes=loop-vectorize --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s --check-prefix=ARM64
+
+@input1_f32 = global [4096 x float] zeroinitializer, align 4
+@input2_f32 = global [4096 x float] zeroinitializer, align 4
+@output_f32 = global [4096 x float] zeroinitializer, align 4
+@input1_f64 = global [4096 x double] zeroinitializer, align 8
+@input2_f64 = global [4096 x double] zeroinitializer, align 8
+@output_f64 = global [4096 x double] zeroinitializer, align 8
+@input1_f16 = global [4096 x half] zeroinitializer, align 2
+@input2_f16 = global [4096 x half] zeroinitializer, align 2
+@output_f16 = global [4096 x half] zeroinitializer, align 2
+
+define void @f32min()  {
+; ARM64-LABEL: define void @f32min(
+; ARM64-SAME: ) #[[ATTR0:[0-9]+]] {
+; ARM64-NEXT:  [[ENTRY:.*]]:
+; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
+; ARM64:       [[FOR_COND_CLEANUP:.*]]:
+; ARM64-NEXT:    ret void
+; ARM64:       [[FOR_BODY]]:
+; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; ARM64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 %indvars.iv
+  %input1 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 %indvars.iv
+  %input2 = load float, ptr %arrayidx2, align 4
+  %output = tail call float @llvm.minimumnum.f32(float %input1, float %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 %indvars.iv
+  store float %output, ptr %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.minimumnum.f32(float, float)
+
+define void @f32max()  {
+; ARM64-LABEL: define void @f32max(
+; ARM64-SAME: ) #[[ATTR0]] {
+; ARM64-NEXT:  [[ENTRY:.*]]:
+; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
+; ARM64:       [[FOR_COND_CLEANUP:.*]]:
+; ARM64-NEXT:    ret void
+; ARM64:       [[FOR_BODY]]:
+; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; ARM64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 %indvars.iv
+  %input1 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 %indvars.iv
+  %input2 = load float, ptr %arrayidx2, align 4
+  %output = tail call float @llvm.maximumnum.f32(float %input1, float %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 %indvars.iv
+  store float %output, ptr %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.maximumnum.f32(float, float)
+
+define void @f64min()  {
+; ARM64-LABEL: define void @f64min(
+; ARM64-SAME: ) #[[ATTR0]] {
+; ARM64-NEXT:  [[ENTRY:.*]]:
+; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
+; ARM64:       [[FOR_COND_CLEANUP:.*]]:
+; ARM64-NEXT:    ret void
+; ARM64:       [[FOR_BODY]]:
+; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; ARM64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 %indvars.iv
+  %input1 = load double, ptr %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 %indvars.iv
+  %input2 = load double, ptr %arrayidx2, align 8
+  %output = tail call double @llvm.minimumnum.f64(double %input1, double %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 %indvars.iv
+  store double %output, ptr %arrayidx4, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare double @llvm.minimumnum.f64(double, double)
+
+define void @f64max()  {
+; ARM64-LABEL: define void @f64max(
+; ARM64-SAME: ) #[[ATTR0]] {
+; ARM64-NEXT:  [[ENTRY:.*]]:
+; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
+; ARM64:       [[FOR_COND_CLEANUP:.*]]:
+; ARM64-NEXT:    ret void
+; ARM64:       [[FOR_BODY]]:
+; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; ARM64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 %indvars.iv
+  %input1 = load double, ptr %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 %indvars.iv
+  %input2 = load double, ptr %arrayidx2, align 8
+  %output = tail call double @llvm.maximumnum.f64(double %input1, double %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 %indvars.iv
+  store double %output, ptr %arrayidx4, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare double @llvm.maximumnum.f64(double, double)
+
+define void @f16min()  {
+; ARM64-LABEL: define void @f16min(
+; ARM64-SAME: ) #[[ATTR0]] {
+; ARM64-NEXT:  [[ENTRY:.*]]:
+; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
+; ARM64:       [[FOR_COND_CLEANUP:.*]]:
+; ARM64-NEXT:    ret void
+; ARM64:       [[FOR_BODY]]:
+; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; ARM64-NEXT:    [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]])
+; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 %indvars.iv
+  %input1 = load half, ptr %arrayidx, align 2
+  %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 %indvars.iv
+  %input2 = load half, ptr %arrayidx2, align 2
+  %output = tail call half @llvm.minimumnum.f16(half %input1, half %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 %indvars.iv
+  store half %output, ptr %arrayidx4, align 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare half @llvm.minimumnum.f16(half, half)
+
+define void @f16max()  {
+; ARM64-LABEL: define void @f16max(
+; ARM64-SAME: ) #[[ATTR0]] {
+; ARM64-NEXT:  [[ENTRY:.*]]:
+; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
+; ARM64:       [[FOR_COND_CLEANUP:.*]]:
+; ARM64-NEXT:    ret void
+; ARM64:       [[FOR_BODY]]:
+; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; ARM64-NEXT:    [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]])
+; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; ARM64-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 %indvars.iv
+  %input1 = load half, ptr %arrayidx, align 2
+  %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 %indvars.iv
+  %input2 = load half, ptr %arrayidx2, align 2
+  %output = tail call half @llvm.maximumnum.f16(half %input1, half %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 %indvars.iv
+  store half %output, ptr %arrayidx4, align 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare half @llvm.maximumnum.f16(half, half)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
new file mode 100644
index 0000000000000..1ff43856a8bc1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
+; RUN: opt --passes=loop-vectorize --mtriple=riscv64 -mattr="+zvfh,+v,+zfh" -S < %s | FileCheck %s --check-prefix=RV64
+
+@input1_f32 = global [4096 x float] zeroinitializer, align 4
+@input2_f32 = global [4096 x float] zeroinitializer, align 4
+@output_f32 = global [4096 x float] zeroinitializer, align 4
+@input1_f64 = global [4096 x double] zeroinitializer, align 8
+@input2_f64 = global [4096 x double] zeroinitializer, align 8
+@output_f64 = global [4096 x double] zeroinitializer, align 8
+@input1_f16 = global [4096 x half] zeroinitializer, align 2
+@input2_f16 = global [4096 x half] zeroinitializer, align 2
+@output_f16 = global [4096 x half] zeroinitializer, align 2
+
+define void @f32min()  {
+; RV64-LABEL: define void @f32min(
+; RV64-SAME: ) #[[ATTR0:[0-9]+]] {
+; RV64-NEXT:  [[ENTRY:.*]]:
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_COND_CLEANUP:.*]]:
+; RV64-NEXT:    ret void
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; RV64-NEXT:    [[TMP16:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP14]], float [[TMP15]])
+; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    store float [[TMP16]], ptr [[ARRAYIDX4]], align 4
+; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 %indvars.iv
+  %input1 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 %indvars.iv
+  %input2 = load float, ptr %arrayidx2, align 4
+  %output = tail call float @llvm.minimumnum.f32(float %input1, float %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 %indvars.iv
+  store float %output, ptr %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.minimumnum.f32(float, float)
+
+define void @f32max()  {
+; RV64-LABEL: define void @f32max(
+; RV64-SAME: ) #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*]]:
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_COND_CLEANUP:.*]]:
+; RV64-NEXT:    ret void
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; RV64-NEXT:    [[TMP16:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP14]], float [[TMP15]])
+; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    store float [[TMP16]], ptr [[ARRAYIDX4]], align 4
+; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 %indvars.iv
+  %input1 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 %indvars.iv
+  %input2 = load float, ptr %arrayidx2, align 4
+  %output = tail call float @llvm.maximumnum.f32(float %input1, float %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 %indvars.iv
+  store float %output, ptr %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.maximumnum.f32(float, float)
+
+define void @f64min()  {
+; RV64-LABEL: define void @f64min(
+; RV64-SAME: ) #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*]]:
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_COND_CLEANUP:.*]]:
+; RV64-NEXT:    ret void
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; RV64-NEXT:    [[TMP16:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP14]], double [[TMP15]])
+; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    store double [[TMP16]], ptr [[ARRAYIDX4]], align 8
+; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 %indvars.iv
+  %input1 = load double, ptr %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 %indvars.iv
+  %input2 = load double, ptr %arrayidx2, align 8
+  %output = tail call double @llvm.minimumnum.f64(double %input1, double %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 %indvars.iv
+  store double %output, ptr %arrayidx4, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare double @llvm.minimumnum.f64(double, double)
+
+define void @f64max()  {
+; RV64-LABEL: define void @f64max(
+; RV64-SAME: ) #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*]]:
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_COND_CLEANUP:.*]]:
+; RV64-NEXT:    ret void
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; RV64-NEXT:    [[TMP16:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP14]], double [[TMP15]])
+; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    store double [[TMP16]], ptr [[ARRAYIDX4]], align 8
+; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 %indvars.iv
+  %input1 = load double, ptr %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 %indvars.iv
+  %input2 = load double, ptr %arrayidx2, align 8
+  %output = tail call double @llvm.maximumnum.f64(double %input1, double %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 %indvars.iv
+  store double %output, ptr %arrayidx4, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare double @llvm.maximumnum.f64(double, double)
+
+define void @f16min()  {
+; RV64-LABEL: define void @f16min(
+; RV64-SAME: ) #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*]]:
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_COND_CLEANUP:.*]]:
+; RV64-NEXT:    ret void
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; RV64-NEXT:    [[TMP16:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP14]], half [[TMP15]])
+; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    store half [[TMP16]], ptr [[ARRAYIDX4]], align 2
+; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 %indvars.iv
+  %input1 = load half, ptr %arrayidx, align 2
+  %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 %indvars.iv
+  %input2 = load half, ptr %arrayidx2, align 2
+  %output = tail call half @llvm.minimumnum.f16(half %input1, half %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 %indvars.iv
+  store half %output, ptr %arrayidx4, align 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare half @llvm.minimumnum.f16(half, half)
+
+define void @f16max()  {
+; RV64-LABEL: define void @f16max(
+; RV64-SAME: ) #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*]]:
+; RV64-NEXT:    br label %[[FOR_BODY:.*]]
+; RV64:       [[FOR_COND_CLEANUP:.*]]:
+; RV64-NEXT:    ret void
+; RV64:       [[FOR_BODY]]:
+; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    [[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; RV64-NEXT:    [[TMP16:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP14]], half [[TMP15]])
+; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT:    store half [[TMP16]], ptr [[ARRAYIDX4]], align 2
+; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 %indvars.iv
+  %input1 = load half, ptr %arrayidx, align 2
+  %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 %indvars.iv
+  %input2 = load half, ptr %arrayidx2, align 2
+  %output = tail call half @llvm.maximumnum.f16(half %input1, half %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 %indvars.iv
+  store half %output, ptr %arrayidx4, align 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare half @llvm.maximumnum.f16(half, half)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll
new file mode 100644
index 0000000000000..8f2cd266b6bda
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll
@@ -0,0 +1,259 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
+; RUN: opt --passes=loop-vectorize --mtriple=x86_64 -S < %s | FileCheck %s --check-prefix=X64
+
+@input1_f32 = global [4096 x float] zeroinitializer, align 4
+@input2_f32 = global [4096 x float] zeroinitializer, align 4
+@output_f32 = global [4096 x float] zeroinitializer, align 4
+@input1_f64 = global [4096 x double] zeroinitializer, align 8
+@input2_f64 = global [4096 x double] zeroinitializer, align 8
+@output_f64 = global [4096 x double] zeroinitializer, align 8
+@input1_f16 = global [4096 x half] zeroinitializer, align 2
+@input2_f16 = global [4096 x half] zeroinitializer, align 2
+@output_f16 = global [4096 x half] zeroinitializer, align 2
+
+define void @f32min()  {
+; X64-LABEL: define void @f32min() {
+; X64-NEXT:  [[ENTRY:.*]]:
+; X64-NEXT:    br label %[[FOR_BODY:.*]]
+; X64:       [[FOR_COND_CLEANUP:.*]]:
+; X64-NEXT:    ret void
+; X64:       [[FOR_BODY]]:
+; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; X64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 %indvars.iv
+  %input1 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 %indvars.iv
+  %input2 = load float, ptr %arrayidx2, align 4
+  %output = tail call float @llvm.minimumnum.f32(float %input1, float %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 %indvars.iv
+  store float %output, ptr %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.minimumnum.f32(float, float)
+
+define void @f32max()  {
+; X64-LABEL: define void @f32max() {
+; X64-NEXT:  [[ENTRY:.*]]:
+; X64-NEXT:    br label %[[FOR_BODY:.*]]
+; X64:       [[FOR_COND_CLEANUP:.*]]:
+; X64-NEXT:    ret void
+; X64:       [[FOR_BODY]]:
+; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; X64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 %indvars.iv
+  %input1 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 %indvars.iv
+  %input2 = load float, ptr %arrayidx2, align 4
+  %output = tail call float @llvm.maximumnum.f32(float %input1, float %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 %indvars.iv
+  store float %output, ptr %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.maximumnum.f32(float, float)
+
+define void @f64min()  {
+; X64-LABEL: define void @f64min() {
+; X64-NEXT:  [[ENTRY:.*]]:
+; X64-NEXT:    br label %[[FOR_BODY:.*]]
+; X64:       [[FOR_COND_CLEANUP:.*]]:
+; X64-NEXT:    ret void
+; X64:       [[FOR_BODY]]:
+; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; X64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 %indvars.iv
+  %input1 = load double, ptr %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 %indvars.iv
+  %input2 = load double, ptr %arrayidx2, align 8
+  %output = tail call double @llvm.minimumnum.f64(double %input1, double %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 %indvars.iv
+  store double %output, ptr %arrayidx4, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare double @llvm.minimumnum.f64(double, double)
+
+define void @f64max()  {
+; X64-LABEL: define void @f64max() {
+; X64-NEXT:  [[ENTRY:.*]]:
+; X64-NEXT:    br label %[[FOR_BODY:.*]]
+; X64:       [[FOR_COND_CLEANUP:.*]]:
+; X64-NEXT:    ret void
+; X64:       [[FOR_BODY]]:
+; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; X64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 %indvars.iv
+  %input1 = load double, ptr %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 %indvars.iv
+  %input2 = load double, ptr %arrayidx2, align 8
+  %output = tail call double @llvm.maximumnum.f64(double %input1, double %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 %indvars.iv
+  store double %output, ptr %arrayidx4, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare double @llvm.maximumnum.f64(double, double)
+
+define void @f16min()  {
+; X64-LABEL: define void @f16min() {
+; X64-NEXT:  [[ENTRY:.*]]:
+; X64-NEXT:    br label %[[FOR_BODY:.*]]
+; X64:       [[FOR_COND_CLEANUP:.*]]:
+; X64-NEXT:    ret void
+; X64:       [[FOR_BODY]]:
+; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; X64-NEXT:    [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]])
+; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 %indvars.iv
+  %input1 = load half, ptr %arrayidx, align 2
+  %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 %indvars.iv
+  %input2 = load half, ptr %arrayidx2, align 2
+  %output = tail call half @llvm.minimumnum.f16(half %input1, half %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 %indvars.iv
+  store half %output, ptr %arrayidx4, align 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare half @llvm.minimumnum.f16(half, half)
+
+define void @f16max()  {
+; X64-LABEL: define void @f16max() {
+; X64-NEXT:  [[ENTRY:.*]]:
+; X64-NEXT:    br label %[[FOR_BODY:.*]]
+; X64:       [[FOR_COND_CLEANUP:.*]]:
+; X64-NEXT:    ret void
+; X64:       [[FOR_BODY]]:
+; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; X64-NEXT:    [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]])
+; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; X64-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 %indvars.iv
+  %input1 = load half, ptr %arrayidx, align 2
+  %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 %indvars.iv
+  %input2 = load half, ptr %arrayidx2, align 2
+  %output = tail call half @llvm.maximumnum.f16(half %input1, half %input2)
+  %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 %indvars.iv
+  store half %output, ptr %arrayidx4, align 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare half @llvm.maximumnum.f16(half, half)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll
new file mode 100644
index 0000000000000..4acc011df8b18
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll
@@ -0,0 +1,516 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --passes=slp-vectorizer --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s --check-prefix=ARM64
+
+@input1_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
+@input2_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
+@output_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
+@input1_f64 = dso_local local_unnamed_addr global [9 x double] zeroinitializer, align 16
+@input2_f64 = dso_local local_unnamed_addr global [9 x double] zeroinitializer, align 16
+@output_f64 = dso_local local_unnamed_addr global [9 x double] zeroinitializer, align 16
+@input1_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
+@input2_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
+@output_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
+
+define dso_local void @fmin32() local_unnamed_addr  {
+; ARM64-LABEL: define dso_local void @fmin32(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; ARM64-NEXT:  [[ENTRY:.*:]]
+; ARM64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; ARM64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; ARM64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
+; ARM64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; ARM64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; ARM64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; ARM64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
+; ARM64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; ARM64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; ARM64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; ARM64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
+; ARM64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; ARM64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; ARM64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; ARM64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
+; ARM64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; ARM64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; ARM64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; ARM64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; ARM64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; ARM64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; ARM64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; ARM64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
+; ARM64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; ARM64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; ARM64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; ARM64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
+; ARM64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; ARM64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; ARM64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; ARM64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
+; ARM64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; ARM64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; ARM64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; ARM64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
+; ARM64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; ARM64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load float, ptr @input1_f32, align 16
+  %input0_1 = load float, ptr @input2_f32, align 16
+  %output0 = tail call float @llvm.minimumnum.f32(float %input0_0, float %input0_1)
+  store float %output0, ptr @output_f32, align 16
+  %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+  %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+  %output1 = tail call float @llvm.minimumnum.f32(float %input1_1, float %input1_2)
+  store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+  %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+  %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+  %output2 = tail call float @llvm.minimumnum.f32(float %input2_1, float %input2_2)
+  store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+  %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+  %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+  %output3 = tail call float @llvm.minimumnum.f32(float %input3_1, float %input3_2)
+  store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+  %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+  %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+  %output4 = tail call float @llvm.minimumnum.f32(float %input4_1, float %input4_2)
+  store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+  %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+  %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+  %output5 = tail call float @llvm.minimumnum.f32(float %input5_1, float %input5_2)
+  store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+  %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+  %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+  %output6 = tail call float @llvm.minimumnum.f32(float %input6_1, float %input6_2)
+  store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+  %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+  %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+  %output7 = tail call float @llvm.minimumnum.f32(float %input7_1, float %input7_2)
+  store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+  %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+  %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+  %output8 = tail call float @llvm.minimumnum.f32(float %input8_1, float %input8_2)
+  store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+  ret void
+}
+
+declare float @llvm.minimumnum.f32(float, float)
+
+define dso_local void @fmax32() local_unnamed_addr  {
+; ARM64-LABEL: define dso_local void @fmax32(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; ARM64-NEXT:  [[ENTRY:.*:]]
+; ARM64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; ARM64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; ARM64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
+; ARM64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; ARM64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; ARM64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; ARM64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
+; ARM64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; ARM64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; ARM64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; ARM64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
+; ARM64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; ARM64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; ARM64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; ARM64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
+; ARM64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; ARM64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; ARM64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; ARM64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; ARM64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; ARM64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; ARM64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; ARM64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
+; ARM64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; ARM64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; ARM64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; ARM64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
+; ARM64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; ARM64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; ARM64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; ARM64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
+; ARM64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; ARM64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; ARM64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; ARM64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
+; ARM64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; ARM64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load float, ptr @input1_f32, align 16
+  %input0_1 = load float, ptr @input2_f32, align 16
+  %output0 = tail call float @llvm.maximumnum.f32(float %input0_0, float %input0_1)
+  store float %output0, ptr @output_f32, align 16
+  %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+  %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+  %output1 = tail call float @llvm.maximumnum.f32(float %input1_1, float %input1_2)
+  store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+  %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+  %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+  %output2 = tail call float @llvm.maximumnum.f32(float %input2_1, float %input2_2)
+  store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+  %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+  %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+  %output3 = tail call float @llvm.maximumnum.f32(float %input3_1, float %input3_2)
+  store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+  %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+  %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+  %output4 = tail call float @llvm.maximumnum.f32(float %input4_1, float %input4_2)
+  store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+  %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+  %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+  %output5 = tail call float @llvm.maximumnum.f32(float %input5_1, float %input5_2)
+  store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+  %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+  %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+  %output6 = tail call float @llvm.maximumnum.f32(float %input6_1, float %input6_2)
+  store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+  %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+  %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+  %output7 = tail call float @llvm.maximumnum.f32(float %input7_1, float %input7_2)
+  store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+  %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+  %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+  %output8 = tail call float @llvm.maximumnum.f32(float %input8_1, float %input8_2)
+  store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+  ret void
+}
+
+declare float @llvm.maximumnum.f32(float, float)
+
+define dso_local void @fmin64() local_unnamed_addr  {
+; ARM64-LABEL: define dso_local void @fmin64(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; ARM64-NEXT:  [[ENTRY:.*:]]
+; ARM64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; ARM64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; ARM64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
+; ARM64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; ARM64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; ARM64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; ARM64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
+; ARM64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; ARM64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; ARM64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; ARM64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
+; ARM64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; ARM64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; ARM64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; ARM64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
+; ARM64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; ARM64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; ARM64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; ARM64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; ARM64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; ARM64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; ARM64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; ARM64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
+; ARM64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; ARM64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; ARM64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; ARM64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
+; ARM64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; ARM64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; ARM64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; ARM64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
+; ARM64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; ARM64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; ARM64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; ARM64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
+; ARM64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; ARM64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load double, ptr @input1_f64, align 16
+  %input0_1 = load double, ptr @input2_f64, align 16
+  %output0 = tail call double @llvm.minimumnum.f64(double %input0_0, double %input0_1)
+  store double %output0, ptr @output_f64, align 16
+  %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+  %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+  %output1 = tail call double @llvm.minimumnum.f64(double %input1_1, double %input1_2)
+  store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+  %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+  %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+  %output2 = tail call double @llvm.minimumnum.f64(double %input2_1, double %input2_2)
+  store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+  %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+  %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+  %output3 = tail call double @llvm.minimumnum.f64(double %input3_1, double %input3_2)
+  store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+  %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+  %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+  %output4 = tail call double @llvm.minimumnum.f64(double %input4_1, double %input4_2)
+  store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+  %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+  %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+  %output5 = tail call double @llvm.minimumnum.f64(double %input5_1, double %input5_2)
+  store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+  %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+  %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+  %output6 = tail call double @llvm.minimumnum.f64(double %input6_1, double %input6_2)
+  store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+  %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+  %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+  %output7 = tail call double @llvm.minimumnum.f64(double %input7_1, double %input7_2)
+  store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+  %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+  %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+  %output8 = tail call double @llvm.minimumnum.f64(double %input8_1, double %input8_2)
+  store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+  ret void
+}
+
+declare double @llvm.minimumnum.f64(double, double)
+
+define dso_local void @fmax64() local_unnamed_addr  {
+; ARM64-LABEL: define dso_local void @fmax64(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; ARM64-NEXT:  [[ENTRY:.*:]]
+; ARM64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; ARM64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; ARM64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
+; ARM64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; ARM64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; ARM64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; ARM64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
+; ARM64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; ARM64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; ARM64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; ARM64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
+; ARM64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; ARM64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; ARM64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; ARM64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
+; ARM64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; ARM64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; ARM64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; ARM64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; ARM64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; ARM64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; ARM64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; ARM64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
+; ARM64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; ARM64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; ARM64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; ARM64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
+; ARM64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; ARM64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; ARM64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; ARM64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
+; ARM64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; ARM64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; ARM64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; ARM64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
+; ARM64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; ARM64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load double, ptr @input1_f64, align 16
+  %input0_1 = load double, ptr @input2_f64, align 16
+  %output0 = tail call double @llvm.maximumnum.f64(double %input0_0, double %input0_1)
+  store double %output0, ptr @output_f64, align 16
+  %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+  %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+  %output1 = tail call double @llvm.maximumnum.f64(double %input1_1, double %input1_2)
+  store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+  %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+  %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+  %output2 = tail call double @llvm.maximumnum.f64(double %input2_1, double %input2_2)
+  store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+  %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+  %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+  %output3 = tail call double @llvm.maximumnum.f64(double %input3_1, double %input3_2)
+  store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+  %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+  %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+  %output4 = tail call double @llvm.maximumnum.f64(double %input4_1, double %input4_2)
+  store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+  %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+  %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+  %output5 = tail call double @llvm.maximumnum.f64(double %input5_1, double %input5_2)
+  store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+  %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+  %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+  %output6 = tail call double @llvm.maximumnum.f64(double %input6_1, double %input6_2)
+  store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+  %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+  %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+  %output7 = tail call double @llvm.maximumnum.f64(double %input7_1, double %input7_2)
+  store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+  %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+  %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+  %output8 = tail call double @llvm.maximumnum.f64(double %input8_1, double %input8_2)
+  store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+  ret void
+}
+
+declare double @llvm.maximumnum.f64(double, double)
+
+define dso_local void @fmin16() local_unnamed_addr  {
+; ARM64-LABEL: define dso_local void @fmin16(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; ARM64-NEXT:  [[ENTRY:.*:]]
+; ARM64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; ARM64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; ARM64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
+; ARM64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; ARM64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; ARM64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; ARM64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
+; ARM64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; ARM64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; ARM64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; ARM64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
+; ARM64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; ARM64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; ARM64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; ARM64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
+; ARM64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; ARM64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; ARM64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; ARM64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
+; ARM64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; ARM64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; ARM64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; ARM64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
+; ARM64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; ARM64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; ARM64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; ARM64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
+; ARM64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; ARM64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; ARM64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; ARM64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
+; ARM64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; ARM64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; ARM64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; ARM64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
+; ARM64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; ARM64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load half, ptr @input1_f16, align 16
+  %input0_1 = load half, ptr @input2_f16, align 16
+  %output0 = tail call half @llvm.minimumnum.f16(half %input0_0, half %input0_1)
+  store half %output0, ptr @output_f16, align 16
+  %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+  %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+  %output1 = tail call half @llvm.minimumnum.f16(half %input1_1, half %input1_2)
+  store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+  %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+  %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+  %output2 = tail call half @llvm.minimumnum.f16(half %input2_1, half %input2_2)
+  store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+  %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+  %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+  %output3 = tail call half @llvm.minimumnum.f16(half %input3_1, half %input3_2)
+  store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+  %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+  %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+  %output4 = tail call half @llvm.minimumnum.f16(half %input4_1, half %input4_2)
+  store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+  %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+  %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+  %output5 = tail call half @llvm.minimumnum.f16(half %input5_1, half %input5_2)
+  store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+  %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+  %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+  %output6 = tail call half @llvm.minimumnum.f16(half %input6_1, half %input6_2)
+  store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+  %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+  %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+  %output7 = tail call half @llvm.minimumnum.f16(half %input7_1, half %input7_2)
+  store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+  %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+  %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+  %output8 = tail call half @llvm.minimumnum.f16(half %input8_1, half %input8_2)
+  store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+  ret void
+}
+
+declare half @llvm.minimumnum.f16(half, half)
+
+define dso_local void @fmax16() local_unnamed_addr  {
+; ARM64-LABEL: define dso_local void @fmax16(
+; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; ARM64-NEXT:  [[ENTRY:.*:]]
+; ARM64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; ARM64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; ARM64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
+; ARM64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; ARM64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; ARM64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; ARM64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
+; ARM64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; ARM64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; ARM64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; ARM64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
+; ARM64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; ARM64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; ARM64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; ARM64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
+; ARM64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; ARM64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; ARM64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; ARM64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
+; ARM64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; ARM64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; ARM64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; ARM64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
+; ARM64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; ARM64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; ARM64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; ARM64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
+; ARM64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; ARM64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; ARM64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; ARM64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
+; ARM64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; ARM64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; ARM64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; ARM64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
+; ARM64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; ARM64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load half, ptr @input1_f16, align 16
+  %input0_1 = load half, ptr @input2_f16, align 16
+  %output0 = tail call half @llvm.maximumnum.f16(half %input0_0, half %input0_1)
+  store half %output0, ptr @output_f16, align 16
+  %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+  %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+  %output1 = tail call half @llvm.maximumnum.f16(half %input1_1, half %input1_2)
+  store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+  %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+  %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+  %output2 = tail call half @llvm.maximumnum.f16(half %input2_1, half %input2_2)
+  store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+  %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+  %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+  %output3 = tail call half @llvm.maximumnum.f16(half %input3_1, half %input3_2)
+  store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+  %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+  %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+  %output4 = tail call half @llvm.maximumnum.f16(half %input4_1, half %input4_2)
+  store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+  %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+  %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+  %output5 = tail call half @llvm.maximumnum.f16(half %input5_1, half %input5_2)
+  store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+  %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+  %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+  %output6 = tail call half @llvm.maximumnum.f16(half %input6_1, half %input6_2)
+  store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+  %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+  %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+  %output7 = tail call half @llvm.maximumnum.f16(half %input7_1, half %input7_2)
+  store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+  %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+  %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+  %output8 = tail call half @llvm.maximumnum.f16(half %input8_1, half %input8_2)
+  store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+  ret void
+}
+
+declare half @llvm.maximumnum.f16(half, half)
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll
new file mode 100644
index 0000000000000..c972255d8127d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll
@@ -0,0 +1,516 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
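+; FIXME: SLP vectorization of fminimumnum/fmaximumnum is not supported yet, so the scalar calls in the checks below are the expected current output; update these checks once support lands.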
+; RUN: opt --passes=slp-vectorizer --mtriple=riscv64 -mattr="+zvfh,+v,+zfh" -S < %s | FileCheck %s --check-prefix=RV64
+
+ at input1_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
+ at input2_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
+ at output_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
+ at input1_f64 = dso_local local_unnamed_addr global [9 x double] zeroinitializer, align 16
+ at input2_f64 = dso_local local_unnamed_addr global [9 x double] zeroinitializer, align 16
+ at output_f64 = dso_local local_unnamed_addr global [9 x double] zeroinitializer, align 16
+ at input1_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
+ at input2_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
+ at output_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
+
+define dso_local void @fmin32() local_unnamed_addr  {
+; RV64-LABEL: define dso_local void @fmin32(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; RV64-NEXT:  [[ENTRY:.*:]]
+; RV64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; RV64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; RV64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
+; RV64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; RV64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; RV64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; RV64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
+; RV64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; RV64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; RV64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; RV64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
+; RV64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; RV64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; RV64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; RV64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
+; RV64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; RV64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; RV64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; RV64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; RV64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; RV64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; RV64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; RV64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
+; RV64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; RV64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; RV64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; RV64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
+; RV64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; RV64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; RV64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; RV64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
+; RV64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; RV64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; RV64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; RV64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
+; RV64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; RV64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load float, ptr @input1_f32, align 16
+  %input0_1 = load float, ptr @input2_f32, align 16
+  %output0 = tail call float @llvm.minimumnum.f32(float %input0_0, float %input0_1)
+  store float %output0, ptr @output_f32, align 16
+  %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+  %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+  %output1 = tail call float @llvm.minimumnum.f32(float %input1_1, float %input1_2)
+  store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+  %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+  %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+  %output2 = tail call float @llvm.minimumnum.f32(float %input2_1, float %input2_2)
+  store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+  %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+  %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+  %output3 = tail call float @llvm.minimumnum.f32(float %input3_1, float %input3_2)
+  store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+  %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+  %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+  %output4 = tail call float @llvm.minimumnum.f32(float %input4_1, float %input4_2)
+  store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+  %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+  %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+  %output5 = tail call float @llvm.minimumnum.f32(float %input5_1, float %input5_2)
+  store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+  %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+  %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+  %output6 = tail call float @llvm.minimumnum.f32(float %input6_1, float %input6_2)
+  store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+  %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+  %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+  %output7 = tail call float @llvm.minimumnum.f32(float %input7_1, float %input7_2)
+  store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+  %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+  %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+  %output8 = tail call float @llvm.minimumnum.f32(float %input8_1, float %input8_2)
+  store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+  ret void
+}
+
+declare float @llvm.minimumnum.f32(float, float)
+
+define dso_local void @fmax32() local_unnamed_addr  {
+; RV64-LABEL: define dso_local void @fmax32(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*:]]
+; RV64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; RV64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; RV64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
+; RV64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; RV64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; RV64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; RV64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
+; RV64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; RV64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; RV64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; RV64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
+; RV64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; RV64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; RV64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; RV64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
+; RV64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; RV64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; RV64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; RV64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; RV64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; RV64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; RV64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; RV64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
+; RV64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; RV64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; RV64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; RV64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
+; RV64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; RV64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; RV64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; RV64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
+; RV64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; RV64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; RV64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; RV64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
+; RV64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; RV64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load float, ptr @input1_f32, align 16
+  %input0_1 = load float, ptr @input2_f32, align 16
+  %output0 = tail call float @llvm.maximumnum.f32(float %input0_0, float %input0_1)
+  store float %output0, ptr @output_f32, align 16
+  %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+  %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+  %output1 = tail call float @llvm.maximumnum.f32(float %input1_1, float %input1_2)
+  store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+  %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+  %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+  %output2 = tail call float @llvm.maximumnum.f32(float %input2_1, float %input2_2)
+  store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+  %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+  %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+  %output3 = tail call float @llvm.maximumnum.f32(float %input3_1, float %input3_2)
+  store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+  %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+  %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+  %output4 = tail call float @llvm.maximumnum.f32(float %input4_1, float %input4_2)
+  store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+  %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+  %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+  %output5 = tail call float @llvm.maximumnum.f32(float %input5_1, float %input5_2)
+  store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+  %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+  %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+  %output6 = tail call float @llvm.maximumnum.f32(float %input6_1, float %input6_2)
+  store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+  %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+  %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+  %output7 = tail call float @llvm.maximumnum.f32(float %input7_1, float %input7_2)
+  store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+  %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+  %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+  %output8 = tail call float @llvm.maximumnum.f32(float %input8_1, float %input8_2)
+  store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+  ret void
+}
+
+declare float @llvm.maximumnum.f32(float, float)
+
+define dso_local void @fmin64() local_unnamed_addr  {
+; RV64-LABEL: define dso_local void @fmin64(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*:]]
+; RV64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; RV64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; RV64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
+; RV64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; RV64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; RV64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; RV64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
+; RV64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; RV64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; RV64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; RV64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
+; RV64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; RV64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; RV64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; RV64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
+; RV64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; RV64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; RV64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; RV64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; RV64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; RV64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; RV64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; RV64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
+; RV64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; RV64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; RV64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; RV64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
+; RV64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; RV64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; RV64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; RV64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
+; RV64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; RV64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; RV64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; RV64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
+; RV64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; RV64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load double, ptr @input1_f64, align 16
+  %input0_1 = load double, ptr @input2_f64, align 16
+  %output0 = tail call double @llvm.minimumnum.f64(double %input0_0, double %input0_1)
+  store double %output0, ptr @output_f64, align 16
+  %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+  %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+  %output1 = tail call double @llvm.minimumnum.f64(double %input1_1, double %input1_2)
+  store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+  %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+  %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+  %output2 = tail call double @llvm.minimumnum.f64(double %input2_1, double %input2_2)
+  store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+  %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+  %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+  %output3 = tail call double @llvm.minimumnum.f64(double %input3_1, double %input3_2)
+  store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+  %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+  %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+  %output4 = tail call double @llvm.minimumnum.f64(double %input4_1, double %input4_2)
+  store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+  %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+  %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+  %output5 = tail call double @llvm.minimumnum.f64(double %input5_1, double %input5_2)
+  store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+  %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+  %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+  %output6 = tail call double @llvm.minimumnum.f64(double %input6_1, double %input6_2)
+  store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+  %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+  %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+  %output7 = tail call double @llvm.minimumnum.f64(double %input7_1, double %input7_2)
+  store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+  %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+  %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+  %output8 = tail call double @llvm.minimumnum.f64(double %input8_1, double %input8_2)
+  store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+  ret void
+}
+
+declare double @llvm.minimumnum.f64(double, double)
+
+define dso_local void @fmax64() local_unnamed_addr  {
+; RV64-LABEL: define dso_local void @fmax64(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*:]]
+; RV64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; RV64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; RV64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
+; RV64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; RV64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; RV64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; RV64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
+; RV64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; RV64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; RV64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; RV64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
+; RV64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; RV64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; RV64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; RV64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
+; RV64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; RV64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; RV64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; RV64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; RV64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; RV64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; RV64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; RV64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
+; RV64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; RV64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; RV64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; RV64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
+; RV64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; RV64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; RV64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; RV64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
+; RV64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; RV64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; RV64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; RV64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
+; RV64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; RV64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load double, ptr @input1_f64, align 16
+  %input0_1 = load double, ptr @input2_f64, align 16
+  %output0 = tail call double @llvm.maximumnum.f64(double %input0_0, double %input0_1)
+  store double %output0, ptr @output_f64, align 16
+  %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+  %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+  %output1 = tail call double @llvm.maximumnum.f64(double %input1_1, double %input1_2)
+  store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+  %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+  %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+  %output2 = tail call double @llvm.maximumnum.f64(double %input2_1, double %input2_2)
+  store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+  %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+  %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+  %output3 = tail call double @llvm.maximumnum.f64(double %input3_1, double %input3_2)
+  store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+  %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+  %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+  %output4 = tail call double @llvm.maximumnum.f64(double %input4_1, double %input4_2)
+  store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+  %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+  %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+  %output5 = tail call double @llvm.maximumnum.f64(double %input5_1, double %input5_2)
+  store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+  %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+  %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+  %output6 = tail call double @llvm.maximumnum.f64(double %input6_1, double %input6_2)
+  store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+  %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+  %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+  %output7 = tail call double @llvm.maximumnum.f64(double %input7_1, double %input7_2)
+  store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+  %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+  %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+  %output8 = tail call double @llvm.maximumnum.f64(double %input8_1, double %input8_2)
+  store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+  ret void
+}
+
+declare double @llvm.maximumnum.f64(double, double)
+
+define dso_local void @fmin16() local_unnamed_addr  {
+; RV64-LABEL: define dso_local void @fmin16(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*:]]
+; RV64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; RV64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; RV64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
+; RV64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; RV64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; RV64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; RV64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
+; RV64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; RV64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; RV64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; RV64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
+; RV64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; RV64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; RV64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; RV64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
+; RV64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; RV64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; RV64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; RV64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
+; RV64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; RV64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; RV64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; RV64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
+; RV64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; RV64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; RV64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; RV64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
+; RV64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; RV64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; RV64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; RV64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
+; RV64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; RV64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; RV64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; RV64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
+; RV64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; RV64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load half, ptr @input1_f16, align 16
+  %input0_1 = load half, ptr @input2_f16, align 16
+  %output0 = tail call half @llvm.minimumnum.f16(half %input0_0, half %input0_1)
+  store half %output0, ptr @output_f16, align 16
+  %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+  %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+  %output1 = tail call half @llvm.minimumnum.f16(half %input1_1, half %input1_2)
+  store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+  %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+  %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+  %output2 = tail call half @llvm.minimumnum.f16(half %input2_1, half %input2_2)
+  store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+  %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+  %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+  %output3 = tail call half @llvm.minimumnum.f16(half %input3_1, half %input3_2)
+  store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+  %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+  %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+  %output4 = tail call half @llvm.minimumnum.f16(half %input4_1, half %input4_2)
+  store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+  %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+  %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+  %output5 = tail call half @llvm.minimumnum.f16(half %input5_1, half %input5_2)
+  store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+  %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+  %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+  %output6 = tail call half @llvm.minimumnum.f16(half %input6_1, half %input6_2)
+  store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+  %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+  %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+  %output7 = tail call half @llvm.minimumnum.f16(half %input7_1, half %input7_2)
+  store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+  %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+  %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+  %output8 = tail call half @llvm.minimumnum.f16(half %input8_1, half %input8_2)
+  store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+  ret void
+}
+
+declare half @llvm.minimumnum.f16(half, half)
+
+define dso_local void @fmax16() local_unnamed_addr  {
+; RV64-LABEL: define dso_local void @fmax16(
+; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; RV64-NEXT:  [[ENTRY:.*:]]
+; RV64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; RV64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; RV64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
+; RV64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; RV64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; RV64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; RV64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
+; RV64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; RV64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; RV64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; RV64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
+; RV64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; RV64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; RV64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; RV64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
+; RV64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; RV64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; RV64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; RV64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
+; RV64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; RV64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; RV64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; RV64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
+; RV64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; RV64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; RV64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; RV64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
+; RV64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; RV64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; RV64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; RV64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
+; RV64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; RV64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; RV64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; RV64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
+; RV64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; RV64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load half, ptr @input1_f16, align 16
+  %input0_1 = load half, ptr @input2_f16, align 16
+  %output0 = tail call half @llvm.maximumnum.f16(half %input0_0, half %input0_1)
+  store half %output0, ptr @output_f16, align 16
+  %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+  %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+  %output1 = tail call half @llvm.maximumnum.f16(half %input1_1, half %input1_2)
+  store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+  %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+  %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+  %output2 = tail call half @llvm.maximumnum.f16(half %input2_1, half %input2_2)
+  store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+  %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+  %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+  %output3 = tail call half @llvm.maximumnum.f16(half %input3_1, half %input3_2)
+  store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+  %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+  %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+  %output4 = tail call half @llvm.maximumnum.f16(half %input4_1, half %input4_2)
+  store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+  %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+  %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+  %output5 = tail call half @llvm.maximumnum.f16(half %input5_1, half %input5_2)
+  store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+  %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+  %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+  %output6 = tail call half @llvm.maximumnum.f16(half %input6_1, half %input6_2)
+  store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+  %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+  %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+  %output7 = tail call half @llvm.maximumnum.f16(half %input7_1, half %input7_2)
+  store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+  %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+  %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+  %output8 = tail call half @llvm.maximumnum.f16(half %input8_1, half %input8_2)
+  store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+  ret void
+}
+
+declare half @llvm.maximumnum.f16(half, half)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll
new file mode 100644
index 0000000000000..9f8783baea4a9
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll
@@ -0,0 +1,510 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --passes=slp-vectorizer --mtriple=x86_64 -S < %s | FileCheck %s --check-prefix=X64
+
+@input1_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
+@input2_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
+@output_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
+@input1_f64 = dso_local local_unnamed_addr global [9 x double] zeroinitializer, align 16
+@input2_f64 = dso_local local_unnamed_addr global [9 x double] zeroinitializer, align 16
+@output_f64 = dso_local local_unnamed_addr global [9 x double] zeroinitializer, align 16
+@input1_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
+@input2_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
+@output_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
+
+define dso_local void @fmin32() local_unnamed_addr  {
+; X64-LABEL: define dso_local void @fmin32() local_unnamed_addr {
+; X64-NEXT:  [[ENTRY:.*:]]
+; X64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; X64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; X64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
+; X64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; X64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; X64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; X64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
+; X64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; X64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; X64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; X64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
+; X64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; X64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; X64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; X64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
+; X64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; X64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; X64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; X64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; X64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; X64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; X64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; X64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
+; X64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; X64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; X64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; X64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
+; X64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; X64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; X64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; X64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
+; X64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; X64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; X64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; X64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
+; X64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; X64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load float, ptr @input1_f32, align 16
+  %input0_1 = load float, ptr @input2_f32, align 16
+  %output0 = tail call float @llvm.minimumnum.f32(float %input0_0, float %input0_1)
+  store float %output0, ptr @output_f32, align 16
+  %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+  %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+  %output1 = tail call float @llvm.minimumnum.f32(float %input1_1, float %input1_2)
+  store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+  %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+  %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+  %output2 = tail call float @llvm.minimumnum.f32(float %input2_1, float %input2_2)
+  store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+  %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+  %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+  %output3 = tail call float @llvm.minimumnum.f32(float %input3_1, float %input3_2)
+  store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+  %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+  %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+  %output4 = tail call float @llvm.minimumnum.f32(float %input4_1, float %input4_2)
+  store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+  %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+  %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+  %output5 = tail call float @llvm.minimumnum.f32(float %input5_1, float %input5_2)
+  store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+  %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+  %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+  %output6 = tail call float @llvm.minimumnum.f32(float %input6_1, float %input6_2)
+  store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+  %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+  %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+  %output7 = tail call float @llvm.minimumnum.f32(float %input7_1, float %input7_2)
+  store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+  %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+  %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+  %output8 = tail call float @llvm.minimumnum.f32(float %input8_1, float %input8_2)
+  store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+  ret void
+}
+
+declare float @llvm.minimumnum.f32(float, float)
+
+define dso_local void @fmax32() local_unnamed_addr  {
+; X64-LABEL: define dso_local void @fmax32() local_unnamed_addr {
+; X64-NEXT:  [[ENTRY:.*:]]
+; X64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; X64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; X64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
+; X64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; X64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; X64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; X64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
+; X64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; X64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; X64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; X64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
+; X64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; X64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; X64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; X64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
+; X64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; X64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; X64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; X64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; X64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; X64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; X64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; X64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
+; X64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; X64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; X64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; X64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
+; X64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; X64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; X64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; X64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
+; X64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; X64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; X64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; X64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
+; X64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; X64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load float, ptr @input1_f32, align 16
+  %input0_1 = load float, ptr @input2_f32, align 16
+  %output0 = tail call float @llvm.maximumnum.f32(float %input0_0, float %input0_1)
+  store float %output0, ptr @output_f32, align 16
+  %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+  %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+  %output1 = tail call float @llvm.maximumnum.f32(float %input1_1, float %input1_2)
+  store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+  %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+  %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+  %output2 = tail call float @llvm.maximumnum.f32(float %input2_1, float %input2_2)
+  store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+  %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+  %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+  %output3 = tail call float @llvm.maximumnum.f32(float %input3_1, float %input3_2)
+  store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+  %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+  %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+  %output4 = tail call float @llvm.maximumnum.f32(float %input4_1, float %input4_2)
+  store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+  %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+  %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+  %output5 = tail call float @llvm.maximumnum.f32(float %input5_1, float %input5_2)
+  store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+  %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+  %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+  %output6 = tail call float @llvm.maximumnum.f32(float %input6_1, float %input6_2)
+  store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+  %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+  %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+  %output7 = tail call float @llvm.maximumnum.f32(float %input7_1, float %input7_2)
+  store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+  %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+  %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+  %output8 = tail call float @llvm.maximumnum.f32(float %input8_1, float %input8_2)
+  store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+  ret void
+}
+
+declare float @llvm.maximumnum.f32(float, float)
+
+define dso_local void @fmin64() local_unnamed_addr  {
+; X64-LABEL: define dso_local void @fmin64() local_unnamed_addr {
+; X64-NEXT:  [[ENTRY:.*:]]
+; X64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; X64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; X64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
+; X64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; X64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; X64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; X64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
+; X64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; X64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; X64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; X64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
+; X64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; X64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; X64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; X64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
+; X64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; X64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; X64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; X64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; X64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; X64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; X64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; X64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
+; X64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; X64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; X64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; X64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
+; X64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; X64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; X64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; X64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
+; X64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; X64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; X64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; X64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
+; X64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; X64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load double, ptr @input1_f64, align 16
+  %input0_1 = load double, ptr @input2_f64, align 16
+  %output0 = tail call double @llvm.minimumnum.f64(double %input0_0, double %input0_1)
+  store double %output0, ptr @output_f64, align 16
+  %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+  %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+  %output1 = tail call double @llvm.minimumnum.f64(double %input1_1, double %input1_2)
+  store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+  %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+  %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+  %output2 = tail call double @llvm.minimumnum.f64(double %input2_1, double %input2_2)
+  store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+  %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+  %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+  %output3 = tail call double @llvm.minimumnum.f64(double %input3_1, double %input3_2)
+  store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+  %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+  %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+  %output4 = tail call double @llvm.minimumnum.f64(double %input4_1, double %input4_2)
+  store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+  %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+  %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+  %output5 = tail call double @llvm.minimumnum.f64(double %input5_1, double %input5_2)
+  store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+  %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+  %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+  %output6 = tail call double @llvm.minimumnum.f64(double %input6_1, double %input6_2)
+  store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+  %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+  %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+  %output7 = tail call double @llvm.minimumnum.f64(double %input7_1, double %input7_2)
+  store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+  %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+  %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+  %output8 = tail call double @llvm.minimumnum.f64(double %input8_1, double %input8_2)
+  store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+  ret void
+}
+
+declare double @llvm.minimumnum.f64(double, double)
+
+define dso_local void @fmax64() local_unnamed_addr  {
+; X64-LABEL: define dso_local void @fmax64() local_unnamed_addr {
+; X64-NEXT:  [[ENTRY:.*:]]
+; X64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; X64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; X64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
+; X64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; X64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; X64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; X64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
+; X64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; X64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; X64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; X64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
+; X64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; X64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; X64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; X64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
+; X64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; X64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; X64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; X64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; X64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; X64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; X64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; X64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
+; X64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; X64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; X64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; X64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
+; X64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; X64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; X64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; X64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
+; X64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; X64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; X64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; X64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
+; X64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; X64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load double, ptr @input1_f64, align 16
+  %input0_1 = load double, ptr @input2_f64, align 16
+  %output0 = tail call double @llvm.maximumnum.f64(double %input0_0, double %input0_1)
+  store double %output0, ptr @output_f64, align 16
+  %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+  %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+  %output1 = tail call double @llvm.maximumnum.f64(double %input1_1, double %input1_2)
+  store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+  %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+  %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+  %output2 = tail call double @llvm.maximumnum.f64(double %input2_1, double %input2_2)
+  store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+  %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+  %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+  %output3 = tail call double @llvm.maximumnum.f64(double %input3_1, double %input3_2)
+  store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+  %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+  %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+  %output4 = tail call double @llvm.maximumnum.f64(double %input4_1, double %input4_2)
+  store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+  %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+  %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+  %output5 = tail call double @llvm.maximumnum.f64(double %input5_1, double %input5_2)
+  store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+  %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+  %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+  %output6 = tail call double @llvm.maximumnum.f64(double %input6_1, double %input6_2)
+  store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+  %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+  %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+  %output7 = tail call double @llvm.maximumnum.f64(double %input7_1, double %input7_2)
+  store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+  %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+  %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+  %output8 = tail call double @llvm.maximumnum.f64(double %input8_1, double %input8_2)
+  store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+  ret void
+}
+
+declare double @llvm.maximumnum.f64(double, double)
+
+define dso_local void @fmin16() local_unnamed_addr  {
+; X64-LABEL: define dso_local void @fmin16() local_unnamed_addr {
+; X64-NEXT:  [[ENTRY:.*:]]
+; X64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; X64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; X64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
+; X64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; X64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; X64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; X64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
+; X64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; X64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; X64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; X64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
+; X64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; X64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; X64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; X64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
+; X64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; X64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; X64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; X64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
+; X64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; X64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; X64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; X64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
+; X64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; X64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; X64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; X64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
+; X64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; X64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; X64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; X64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
+; X64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; X64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; X64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; X64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
+; X64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; X64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load half, ptr @input1_f16, align 16
+  %input0_1 = load half, ptr @input2_f16, align 16
+  %output0 = tail call half @llvm.minimumnum.f16(half %input0_0, half %input0_1)
+  store half %output0, ptr @output_f16, align 16
+  %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+  %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+  %output1 = tail call half @llvm.minimumnum.f16(half %input1_1, half %input1_2)
+  store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+  %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+  %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+  %output2 = tail call half @llvm.minimumnum.f16(half %input2_1, half %input2_2)
+  store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+  %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+  %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+  %output3 = tail call half @llvm.minimumnum.f16(half %input3_1, half %input3_2)
+  store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+  %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+  %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+  %output4 = tail call half @llvm.minimumnum.f16(half %input4_1, half %input4_2)
+  store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+  %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+  %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+  %output5 = tail call half @llvm.minimumnum.f16(half %input5_1, half %input5_2)
+  store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+  %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+  %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+  %output6 = tail call half @llvm.minimumnum.f16(half %input6_1, half %input6_2)
+  store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+  %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+  %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+  %output7 = tail call half @llvm.minimumnum.f16(half %input7_1, half %input7_2)
+  store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+  %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+  %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+  %output8 = tail call half @llvm.minimumnum.f16(half %input8_1, half %input8_2)
+  store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+  ret void
+}
+
+declare half @llvm.minimumnum.f16(half, half)
+
+define dso_local void @fmax16() local_unnamed_addr  {
+; X64-LABEL: define dso_local void @fmax16() local_unnamed_addr {
+; X64-NEXT:  [[ENTRY:.*:]]
+; X64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; X64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; X64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
+; X64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; X64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; X64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; X64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
+; X64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; X64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; X64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; X64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
+; X64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; X64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; X64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; X64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
+; X64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; X64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; X64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; X64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
+; X64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; X64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; X64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; X64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
+; X64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; X64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; X64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; X64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
+; X64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; X64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; X64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; X64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
+; X64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; X64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; X64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; X64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
+; X64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; X64-NEXT:    ret void
+;
+entry:
+  %input0_0 = load half, ptr @input1_f16, align 16
+  %input0_1 = load half, ptr @input2_f16, align 16
+  %output0 = tail call half @llvm.maximumnum.f16(half %input0_0, half %input0_1)
+  store half %output0, ptr @output_f16, align 16
+  %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+  %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+  %output1 = tail call half @llvm.maximumnum.f16(half %input1_1, half %input1_2)
+  store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+  %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+  %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+  %output2 = tail call half @llvm.maximumnum.f16(half %input2_1, half %input2_2)
+  store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+  %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+  %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+  %output3 = tail call half @llvm.maximumnum.f16(half %input3_1, half %input3_2)
+  store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+  %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+  %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+  %output4 = tail call half @llvm.maximumnum.f16(half %input4_1, half %input4_2)
+  store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+  %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+  %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+  %output5 = tail call half @llvm.maximumnum.f16(half %input5_1, half %input5_2)
+  store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+  %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+  %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+  %output6 = tail call half @llvm.maximumnum.f16(half %input6_1, half %input6_2)
+  store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+  %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+  %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+  %output7 = tail call half @llvm.maximumnum.f16(half %input7_1, half %input7_2)
+  store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+  %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+  %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+  %output8 = tail call half @llvm.maximumnum.f16(half %input8_1, half %input8_2)
+  store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+  ret void
+}
+
+declare half @llvm.maximumnum.f16(half, half)

>From 6c2a529d98d2690479e3eb02c69247621c051c71 Mon Sep 17 00:00:00 2001
From: YunQiang Su <yunqiang at isrc.iscas.ac.cn>
Date: Tue, 1 Apr 2025 12:35:43 +0800
Subject: [PATCH 2/2] Use default check prefix

---
 .../LoopVectorize/AArch64/fminimumnum.ll      | 218 ++++----
 .../LoopVectorize/RISCV/fminimumnum.ll        | 218 ++++----
 .../LoopVectorize/X86/fminimumnum.ll          | 206 ++++----
 .../SLPVectorizer/AArch64/fminimumnum.ll      | 482 +++++++++---------
 .../SLPVectorizer/RISCV/fminimumnum.ll        | 482 +++++++++---------
 .../SLPVectorizer/X86/fminimumnum.ll          | 470 ++++++++---------
 6 files changed, 1038 insertions(+), 1038 deletions(-)
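
Note for reviewers (illustration only, not part of this patch): the FIXME lines in the tests below record that neither the loop vectorizer nor the SLP vectorizer handles llvm.minimumnum/llvm.maximumnum yet, which is why the autogenerated checks still show scalar calls. When support does land, the expected shape is the vector overload of the same intrinsics. A minimal hand-written sketch for <4 x float> follows; the function name is made up for illustration and nothing here is emitted by the current patch:

define <4 x float> @minimumnum_v4f32_sketch(<4 x float> %a, <4 x float> %b) {
entry:
  ; widened form of the scalar llvm.minimumnum.f32 calls used in the tests
  %r = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %a, <4 x float> %b)
  ret <4 x float> %r
}

declare <4 x float> @llvm.minimumnum.v4f32(<4 x float>, <4 x float>)

The checks in these files will be regenerated with update_test_checks.py once the vectorizers produce such calls.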

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
index 16968b3d11420..5453f9ddd8f97 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
-; RUN: opt --passes=loop-vectorize --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s --check-prefix=ARM64
+; RUN: opt --passes=loop-vectorize --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s
 
 @input1_f32 = global [4096 x float] zeroinitializer, align 4
 @input2_f32 = global [4096 x float] zeroinitializer, align 4
@@ -13,24 +13,24 @@
 @output_f16 = global [4096 x half] zeroinitializer, align 2
 
 define void @f32min()  {
-; ARM64-LABEL: define void @f32min(
-; ARM64-SAME: ) #[[ATTR0:[0-9]+]] {
-; ARM64-NEXT:  [[ENTRY:.*]]:
-; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
-; ARM64:       [[FOR_COND_CLEANUP:.*]]:
-; ARM64-NEXT:    ret void
-; ARM64:       [[FOR_BODY]]:
-; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; ARM64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
-; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
-; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f32min(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -55,24 +55,24 @@ for.body:                                         ; preds = %entry, %for.body
 declare float @llvm.minimumnum.f32(float, float)
 
 define void @f32max()  {
-; ARM64-LABEL: define void @f32max(
-; ARM64-SAME: ) #[[ATTR0]] {
-; ARM64-NEXT:  [[ENTRY:.*]]:
-; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
-; ARM64:       [[FOR_COND_CLEANUP:.*]]:
-; ARM64-NEXT:    ret void
-; ARM64:       [[FOR_BODY]]:
-; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; ARM64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
-; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
-; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f32max(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -97,24 +97,24 @@ for.body:                                         ; preds = %entry, %for.body
 declare float @llvm.maximumnum.f32(float, float)
 
 define void @f64min()  {
-; ARM64-LABEL: define void @f64min(
-; ARM64-SAME: ) #[[ATTR0]] {
-; ARM64-NEXT:  [[ENTRY:.*]]:
-; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
-; ARM64:       [[FOR_COND_CLEANUP:.*]]:
-; ARM64-NEXT:    ret void
-; ARM64:       [[FOR_BODY]]:
-; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
-; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; ARM64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
-; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
-; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f64min(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -139,24 +139,24 @@ for.body:                                         ; preds = %entry, %for.body
 declare double @llvm.minimumnum.f64(double, double)
 
 define void @f64max()  {
-; ARM64-LABEL: define void @f64max(
-; ARM64-SAME: ) #[[ATTR0]] {
-; ARM64-NEXT:  [[ENTRY:.*]]:
-; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
-; ARM64:       [[FOR_COND_CLEANUP:.*]]:
-; ARM64-NEXT:    ret void
-; ARM64:       [[FOR_BODY]]:
-; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
-; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; ARM64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
-; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
-; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f64max(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -181,24 +181,24 @@ for.body:                                         ; preds = %entry, %for.body
 declare double @llvm.maximumnum.f64(double, double)
 
 define void @f16min()  {
-; ARM64-LABEL: define void @f16min(
-; ARM64-SAME: ) #[[ATTR0]] {
-; ARM64-NEXT:  [[ENTRY:.*]]:
-; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
-; ARM64:       [[FOR_COND_CLEANUP:.*]]:
-; ARM64-NEXT:    ret void
-; ARM64:       [[FOR_BODY]]:
-; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
-; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
-; ARM64-NEXT:    [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]])
-; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
-; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f16min(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -223,24 +223,24 @@ for.body:                                         ; preds = %entry, %for.body
 declare half @llvm.minimumnum.f16(half, half)
 
 define void @f16max()  {
-; ARM64-LABEL: define void @f16max(
-; ARM64-SAME: ) #[[ATTR0]] {
-; ARM64-NEXT:  [[ENTRY:.*]]:
-; ARM64-NEXT:    br label %[[FOR_BODY:.*]]
-; ARM64:       [[FOR_COND_CLEANUP:.*]]:
-; ARM64-NEXT:    ret void
-; ARM64:       [[FOR_BODY]]:
-; ARM64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; ARM64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
-; ARM64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
-; ARM64-NEXT:    [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]])
-; ARM64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
-; ARM64-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
-; ARM64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; ARM64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; ARM64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f16max(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
index 1ff43856a8bc1..6bb560465049d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
-; RUN: opt --passes=loop-vectorize --mtriple=riscv64 -mattr="+zvfh,+v,+zfh" -S < %s | FileCheck %s --check-prefix=RV64
+; RUN: opt --passes=loop-vectorize --mtriple=riscv64 -mattr="+zvfh,+v,+zfh" -S < %s | FileCheck %s
 
 @input1_f32 = global [4096 x float] zeroinitializer, align 4
 @input2_f32 = global [4096 x float] zeroinitializer, align 4
@@ -13,24 +13,24 @@
 @output_f16 = global [4096 x half] zeroinitializer, align 2
 
 define void @f32min()  {
-; RV64-LABEL: define void @f32min(
-; RV64-SAME: ) #[[ATTR0:[0-9]+]] {
-; RV64-NEXT:  [[ENTRY:.*]]:
-; RV64-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64:       [[FOR_COND_CLEANUP:.*]]:
-; RV64-NEXT:    ret void
-; RV64:       [[FOR_BODY]]:
-; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; RV64-NEXT:    [[TMP16:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP14]], float [[TMP15]])
-; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    store float [[TMP16]], ptr [[ARRAYIDX4]], align 4
-; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f32min(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP14]], float [[TMP15]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP16]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -55,24 +55,24 @@ for.body:                                         ; preds = %entry, %for.body
 declare float @llvm.minimumnum.f32(float, float)
 
 define void @f32max()  {
-; RV64-LABEL: define void @f32max(
-; RV64-SAME: ) #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*]]:
-; RV64-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64:       [[FOR_COND_CLEANUP:.*]]:
-; RV64-NEXT:    ret void
-; RV64:       [[FOR_BODY]]:
-; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; RV64-NEXT:    [[TMP16:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP14]], float [[TMP15]])
-; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    store float [[TMP16]], ptr [[ARRAYIDX4]], align 4
-; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f32max(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP14]], float [[TMP15]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP16]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -97,24 +97,24 @@ for.body:                                         ; preds = %entry, %for.body
 declare float @llvm.maximumnum.f32(float, float)
 
 define void @f64min()  {
-; RV64-LABEL: define void @f64min(
-; RV64-SAME: ) #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*]]:
-; RV64-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64:       [[FOR_COND_CLEANUP:.*]]:
-; RV64-NEXT:    ret void
-; RV64:       [[FOR_BODY]]:
-; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8
-; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; RV64-NEXT:    [[TMP16:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP14]], double [[TMP15]])
-; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    store double [[TMP16]], ptr [[ARRAYIDX4]], align 8
-; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f64min(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP14]], double [[TMP15]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store double [[TMP16]], ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -139,24 +139,24 @@ for.body:                                         ; preds = %entry, %for.body
 declare double @llvm.minimumnum.f64(double, double)
 
 define void @f64max()  {
-; RV64-LABEL: define void @f64max(
-; RV64-SAME: ) #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*]]:
-; RV64-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64:       [[FOR_COND_CLEANUP:.*]]:
-; RV64-NEXT:    ret void
-; RV64:       [[FOR_BODY]]:
-; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8
-; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; RV64-NEXT:    [[TMP16:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP14]], double [[TMP15]])
-; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    store double [[TMP16]], ptr [[ARRAYIDX4]], align 8
-; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f64max(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP14]], double [[TMP15]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store double [[TMP16]], ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -181,24 +181,24 @@ for.body:                                         ; preds = %entry, %for.body
 declare double @llvm.maximumnum.f64(double, double)
 
 define void @f16min()  {
-; RV64-LABEL: define void @f16min(
-; RV64-SAME: ) #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*]]:
-; RV64-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64:       [[FOR_COND_CLEANUP:.*]]:
-; RV64-NEXT:    ret void
-; RV64:       [[FOR_BODY]]:
-; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2
-; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
-; RV64-NEXT:    [[TMP16:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP14]], half [[TMP15]])
-; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    store half [[TMP16]], ptr [[ARRAYIDX4]], align 2
-; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f16min(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP14]], half [[TMP15]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store half [[TMP16]], ptr [[ARRAYIDX4]], align 2
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -223,24 +223,24 @@ for.body:                                         ; preds = %entry, %for.body
 declare half @llvm.minimumnum.f16(half, half)
 
 define void @f16max()  {
-; RV64-LABEL: define void @f16max(
-; RV64-SAME: ) #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*]]:
-; RV64-NEXT:    br label %[[FOR_BODY:.*]]
-; RV64:       [[FOR_COND_CLEANUP:.*]]:
-; RV64-NEXT:    ret void
-; RV64:       [[FOR_BODY]]:
-; RV64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2
-; RV64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    [[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
-; RV64-NEXT:    [[TMP16:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP14]], half [[TMP15]])
-; RV64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT:    store half [[TMP16]], ptr [[ARRAYIDX4]], align 2
-; RV64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; RV64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; RV64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f16max(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP14]], half [[TMP15]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store half [[TMP16]], ptr [[ARRAYIDX4]], align 2
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll
index 8f2cd266b6bda..9b34a6941e86c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
-; RUN: opt --passes=loop-vectorize --mtriple=x86_64 -S < %s | FileCheck %s --check-prefix=X64
+; RUN: opt --passes=loop-vectorize --mtriple=x86_64 -S < %s | FileCheck %s
 
 @input1_f32 = global [4096 x float] zeroinitializer, align 4
 @input2_f32 = global [4096 x float] zeroinitializer, align 4
@@ -13,23 +13,23 @@
 @output_f16 = global [4096 x half] zeroinitializer, align 2
 
 define void @f32min()  {
-; X64-LABEL: define void @f32min() {
-; X64-NEXT:  [[ENTRY:.*]]:
-; X64-NEXT:    br label %[[FOR_BODY:.*]]
-; X64:       [[FOR_COND_CLEANUP:.*]]:
-; X64-NEXT:    ret void
-; X64:       [[FOR_BODY]]:
-; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; X64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
-; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
-; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f32min() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -54,23 +54,23 @@ for.body:                                         ; preds = %entry, %for.body
 declare float @llvm.minimumnum.f32(float, float)
 
 define void @f32max()  {
-; X64-LABEL: define void @f32max() {
-; X64-NEXT:  [[ENTRY:.*]]:
-; X64-NEXT:    br label %[[FOR_BODY:.*]]
-; X64:       [[FOR_COND_CLEANUP:.*]]:
-; X64-NEXT:    ret void
-; X64:       [[FOR_BODY]]:
-; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; X64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
-; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
-; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f32max() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP14]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -95,23 +95,23 @@ for.body:                                         ; preds = %entry, %for.body
 declare float @llvm.maximumnum.f32(float, float)
 
 define void @f64min()  {
-; X64-LABEL: define void @f64min() {
-; X64-NEXT:  [[ENTRY:.*]]:
-; X64-NEXT:    br label %[[FOR_BODY:.*]]
-; X64:       [[FOR_COND_CLEANUP:.*]]:
-; X64-NEXT:    ret void
-; X64:       [[FOR_BODY]]:
-; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
-; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; X64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
-; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
-; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f64min() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -136,23 +136,23 @@ for.body:                                         ; preds = %entry, %for.body
 declare double @llvm.minimumnum.f64(double, double)
 
 define void @f64max()  {
-; X64-LABEL: define void @f64max() {
-; X64-NEXT:  [[ENTRY:.*]]:
-; X64-NEXT:    br label %[[FOR_BODY:.*]]
-; X64:       [[FOR_COND_CLEANUP:.*]]:
-; X64-NEXT:    ret void
-; X64:       [[FOR_BODY]]:
-; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
-; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; X64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
-; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
-; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f64max() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store double [[TMP14]], ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -177,23 +177,23 @@ for.body:                                         ; preds = %entry, %for.body
 declare double @llvm.maximumnum.f64(double, double)
 
 define void @f16min()  {
-; X64-LABEL: define void @f16min() {
-; X64-NEXT:  [[ENTRY:.*]]:
-; X64-NEXT:    br label %[[FOR_BODY:.*]]
-; X64:       [[FOR_COND_CLEANUP:.*]]:
-; X64-NEXT:    ret void
-; X64:       [[FOR_BODY]]:
-; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
-; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
-; X64-NEXT:    [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]])
-; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
-; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f16min() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -218,23 +218,23 @@ for.body:                                         ; preds = %entry, %for.body
 declare half @llvm.minimumnum.f16(half, half)
 
 define void @f16max()  {
-; X64-LABEL: define void @f16max() {
-; X64-NEXT:  [[ENTRY:.*]]:
-; X64-NEXT:    br label %[[FOR_BODY:.*]]
-; X64:       [[FOR_COND_CLEANUP:.*]]:
-; X64-NEXT:    ret void
-; X64:       [[FOR_BODY]]:
-; X64-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; X64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
-; X64-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
-; X64-NEXT:    [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]])
-; X64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
-; X64-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
-; X64-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; X64-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; X64-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-LABEL: define void @f16max() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]])
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store half [[TMP10]], ptr [[ARRAYIDX4]], align 2
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll
index 4acc011df8b18..9cf77f3ed1f16 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt --passes=slp-vectorizer --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s --check-prefix=ARM64
+; RUN: opt --passes=slp-vectorizer --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s
 
 @input1_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
 @input2_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
@@ -12,46 +12,46 @@
 @output_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
 
 define dso_local void @fmin32() local_unnamed_addr  {
-; ARM64-LABEL: define dso_local void @fmin32(
-; ARM64-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; ARM64-NEXT:  [[ENTRY:.*:]]
-; ARM64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
-; ARM64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
-; ARM64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
-; ARM64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
-; ARM64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
-; ARM64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
-; ARM64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
-; ARM64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
-; ARM64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
-; ARM64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
-; ARM64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
-; ARM64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
-; ARM64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
-; ARM64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
-; ARM64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
-; ARM64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
-; ARM64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
-; ARM64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
-; ARM64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
-; ARM64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
-; ARM64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
-; ARM64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
-; ARM64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
-; ARM64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
-; ARM64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
-; ARM64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
-; ARM64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
-; ARM64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
-; ARM64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
-; ARM64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
-; ARM64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
-; ARM64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
-; ARM64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
-; ARM64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
-; ARM64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
-; ARM64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
-; ARM64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmin32(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
+; CHECK-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
+; CHECK-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
+; CHECK-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; CHECK-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
+; CHECK-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
+; CHECK-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
+; CHECK-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load float, ptr @input1_f32, align 16
@@ -96,46 +96,46 @@ entry:
 declare float @llvm.minimumnum.f32(float, float)
 
 define dso_local void @fmax32() local_unnamed_addr  {
-; ARM64-LABEL: define dso_local void @fmax32(
-; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
-; ARM64-NEXT:  [[ENTRY:.*:]]
-; ARM64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
-; ARM64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
-; ARM64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
-; ARM64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
-; ARM64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
-; ARM64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
-; ARM64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
-; ARM64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
-; ARM64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
-; ARM64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
-; ARM64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
-; ARM64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
-; ARM64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
-; ARM64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
-; ARM64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
-; ARM64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
-; ARM64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
-; ARM64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
-; ARM64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
-; ARM64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
-; ARM64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
-; ARM64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
-; ARM64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
-; ARM64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
-; ARM64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
-; ARM64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
-; ARM64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
-; ARM64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
-; ARM64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
-; ARM64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
-; ARM64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
-; ARM64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
-; ARM64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
-; ARM64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
-; ARM64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
-; ARM64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
-; ARM64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmax32(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
+; CHECK-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
+; CHECK-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
+; CHECK-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; CHECK-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
+; CHECK-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
+; CHECK-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
+; CHECK-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load float, ptr @input1_f32, align 16
@@ -180,46 +180,46 @@ entry:
 declare float @llvm.maximumnum.f32(float, float)
 
 define dso_local void @fmin64() local_unnamed_addr  {
-; ARM64-LABEL: define dso_local void @fmin64(
-; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
-; ARM64-NEXT:  [[ENTRY:.*:]]
-; ARM64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
-; ARM64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
-; ARM64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
-; ARM64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
-; ARM64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
-; ARM64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
-; ARM64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
-; ARM64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
-; ARM64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
-; ARM64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
-; ARM64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
-; ARM64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
-; ARM64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
-; ARM64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
-; ARM64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
-; ARM64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
-; ARM64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
-; ARM64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
-; ARM64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
-; ARM64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
-; ARM64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
-; ARM64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
-; ARM64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
-; ARM64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
-; ARM64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
-; ARM64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
-; ARM64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
-; ARM64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
-; ARM64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
-; ARM64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
-; ARM64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
-; ARM64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
-; ARM64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
-; ARM64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
-; ARM64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
-; ARM64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
-; ARM64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmin64(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
+; CHECK-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
+; CHECK-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
+; CHECK-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; CHECK-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
+; CHECK-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
+; CHECK-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
+; CHECK-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
+; CHECK-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load double, ptr @input1_f64, align 16
@@ -264,46 +264,46 @@ entry:
 declare double @llvm.minimumnum.f64(double, double)
 
 define dso_local void @fmax64() local_unnamed_addr  {
-; ARM64-LABEL: define dso_local void @fmax64(
-; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
-; ARM64-NEXT:  [[ENTRY:.*:]]
-; ARM64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
-; ARM64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
-; ARM64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
-; ARM64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
-; ARM64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
-; ARM64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
-; ARM64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
-; ARM64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
-; ARM64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
-; ARM64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
-; ARM64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
-; ARM64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
-; ARM64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
-; ARM64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
-; ARM64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
-; ARM64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
-; ARM64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
-; ARM64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
-; ARM64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
-; ARM64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
-; ARM64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
-; ARM64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
-; ARM64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
-; ARM64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
-; ARM64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
-; ARM64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
-; ARM64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
-; ARM64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
-; ARM64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
-; ARM64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
-; ARM64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
-; ARM64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
-; ARM64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
-; ARM64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
-; ARM64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
-; ARM64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
-; ARM64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmax64(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
+; CHECK-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
+; CHECK-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
+; CHECK-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; CHECK-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
+; CHECK-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
+; CHECK-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
+; CHECK-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
+; CHECK-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load double, ptr @input1_f64, align 16
@@ -348,46 +348,46 @@ entry:
 declare double @llvm.maximumnum.f64(double, double)
 
 define dso_local void @fmin16() local_unnamed_addr  {
-; ARM64-LABEL: define dso_local void @fmin16(
-; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
-; ARM64-NEXT:  [[ENTRY:.*:]]
-; ARM64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
-; ARM64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
-; ARM64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
-; ARM64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
-; ARM64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
-; ARM64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
-; ARM64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
-; ARM64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
-; ARM64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
-; ARM64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
-; ARM64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
-; ARM64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
-; ARM64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
-; ARM64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
-; ARM64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
-; ARM64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
-; ARM64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
-; ARM64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
-; ARM64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
-; ARM64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
-; ARM64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
-; ARM64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
-; ARM64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
-; ARM64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
-; ARM64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
-; ARM64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
-; ARM64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
-; ARM64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
-; ARM64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
-; ARM64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
-; ARM64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
-; ARM64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
-; ARM64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
-; ARM64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
-; ARM64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
-; ARM64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
-; ARM64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmin16(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
+; CHECK-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
+; CHECK-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
+; CHECK-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
+; CHECK-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
+; CHECK-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
+; CHECK-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
+; CHECK-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
+; CHECK-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
+; CHECK-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load half, ptr @input1_f16, align 16
@@ -432,46 +432,46 @@ entry:
 declare half @llvm.minimumnum.f16(half, half)
 
 define dso_local void @fmax16() local_unnamed_addr  {
-; ARM64-LABEL: define dso_local void @fmax16(
-; ARM64-SAME: ) local_unnamed_addr #[[ATTR0]] {
-; ARM64-NEXT:  [[ENTRY:.*:]]
-; ARM64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
-; ARM64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
-; ARM64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
-; ARM64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
-; ARM64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
-; ARM64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
-; ARM64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
-; ARM64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
-; ARM64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
-; ARM64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
-; ARM64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
-; ARM64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
-; ARM64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
-; ARM64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
-; ARM64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
-; ARM64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
-; ARM64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
-; ARM64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
-; ARM64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
-; ARM64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
-; ARM64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
-; ARM64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
-; ARM64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
-; ARM64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
-; ARM64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
-; ARM64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
-; ARM64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
-; ARM64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
-; ARM64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
-; ARM64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
-; ARM64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
-; ARM64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
-; ARM64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
-; ARM64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
-; ARM64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
-; ARM64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
-; ARM64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmax16(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
+; CHECK-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
+; CHECK-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
+; CHECK-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
+; CHECK-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
+; CHECK-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
+; CHECK-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
+; CHECK-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
+; CHECK-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
+; CHECK-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load half, ptr @input1_f16, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll
index c972255d8127d..1fb75e3d45b08 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt --passes=slp-vectorizer --mtriple=riscv64 -mattr="+zvfh,+v,+zfh" -S < %s | FileCheck %s --check-prefix=RV64
+; RUN: opt --passes=slp-vectorizer --mtriple=riscv64 -mattr="+zvfh,+v,+zfh" -S < %s | FileCheck %s
 
 @input1_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
 @input2_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
@@ -12,46 +12,46 @@
 @output_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
 
 define dso_local void @fmin32() local_unnamed_addr  {
-; RV64-LABEL: define dso_local void @fmin32(
-; RV64-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; RV64-NEXT:  [[ENTRY:.*:]]
-; RV64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
-; RV64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
-; RV64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
-; RV64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
-; RV64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
-; RV64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
-; RV64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
-; RV64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
-; RV64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
-; RV64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
-; RV64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
-; RV64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
-; RV64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
-; RV64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
-; RV64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
-; RV64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
-; RV64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
-; RV64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
-; RV64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
-; RV64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
-; RV64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
-; RV64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
-; RV64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
-; RV64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
-; RV64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
-; RV64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
-; RV64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
-; RV64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
-; RV64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
-; RV64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
-; RV64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
-; RV64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
-; RV64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
-; RV64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
-; RV64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
-; RV64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
-; RV64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmin32(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
+; CHECK-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
+; CHECK-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
+; CHECK-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; CHECK-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
+; CHECK-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
+; CHECK-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
+; CHECK-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load float, ptr @input1_f32, align 16
@@ -96,46 +96,46 @@ entry:
 declare float @llvm.minimumnum.f32(float, float)
 
 define dso_local void @fmax32() local_unnamed_addr  {
-; RV64-LABEL: define dso_local void @fmax32(
-; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*:]]
-; RV64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
-; RV64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
-; RV64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
-; RV64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
-; RV64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
-; RV64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
-; RV64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
-; RV64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
-; RV64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
-; RV64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
-; RV64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
-; RV64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
-; RV64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
-; RV64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
-; RV64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
-; RV64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
-; RV64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
-; RV64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
-; RV64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
-; RV64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
-; RV64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
-; RV64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
-; RV64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
-; RV64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
-; RV64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
-; RV64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
-; RV64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
-; RV64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
-; RV64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
-; RV64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
-; RV64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
-; RV64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
-; RV64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
-; RV64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
-; RV64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
-; RV64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
-; RV64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmax32(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
+; CHECK-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
+; CHECK-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
+; CHECK-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; CHECK-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
+; CHECK-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
+; CHECK-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
+; CHECK-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load float, ptr @input1_f32, align 16
@@ -180,46 +180,46 @@ entry:
 declare float @llvm.maximumnum.f32(float, float)
 
 define dso_local void @fmin64() local_unnamed_addr  {
-; RV64-LABEL: define dso_local void @fmin64(
-; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*:]]
-; RV64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
-; RV64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
-; RV64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
-; RV64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
-; RV64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
-; RV64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
-; RV64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
-; RV64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
-; RV64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
-; RV64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
-; RV64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
-; RV64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
-; RV64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
-; RV64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
-; RV64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
-; RV64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
-; RV64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
-; RV64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
-; RV64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
-; RV64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
-; RV64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
-; RV64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
-; RV64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
-; RV64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
-; RV64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
-; RV64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
-; RV64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
-; RV64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
-; RV64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
-; RV64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
-; RV64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
-; RV64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
-; RV64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
-; RV64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
-; RV64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
-; RV64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
-; RV64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmin64(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
+; CHECK-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
+; CHECK-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
+; CHECK-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; CHECK-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
+; CHECK-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
+; CHECK-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
+; CHECK-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
+; CHECK-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load double, ptr @input1_f64, align 16
@@ -264,46 +264,46 @@ entry:
 declare double @llvm.minimumnum.f64(double, double)
 
 define dso_local void @fmax64() local_unnamed_addr  {
-; RV64-LABEL: define dso_local void @fmax64(
-; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*:]]
-; RV64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
-; RV64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
-; RV64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
-; RV64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
-; RV64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
-; RV64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
-; RV64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
-; RV64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
-; RV64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
-; RV64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
-; RV64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
-; RV64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
-; RV64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
-; RV64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
-; RV64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
-; RV64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
-; RV64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
-; RV64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
-; RV64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
-; RV64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
-; RV64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
-; RV64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
-; RV64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
-; RV64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
-; RV64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
-; RV64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
-; RV64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
-; RV64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
-; RV64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
-; RV64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
-; RV64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
-; RV64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
-; RV64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
-; RV64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
-; RV64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
-; RV64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
-; RV64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmax64(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
+; CHECK-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
+; CHECK-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
+; CHECK-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; CHECK-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
+; CHECK-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
+; CHECK-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
+; CHECK-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
+; CHECK-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load double, ptr @input1_f64, align 16
@@ -348,46 +348,46 @@ entry:
 declare double @llvm.maximumnum.f64(double, double)
 
 define dso_local void @fmin16() local_unnamed_addr  {
-; RV64-LABEL: define dso_local void @fmin16(
-; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*:]]
-; RV64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
-; RV64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
-; RV64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
-; RV64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
-; RV64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
-; RV64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
-; RV64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
-; RV64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
-; RV64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
-; RV64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
-; RV64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
-; RV64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
-; RV64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
-; RV64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
-; RV64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
-; RV64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
-; RV64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
-; RV64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
-; RV64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
-; RV64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
-; RV64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
-; RV64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
-; RV64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
-; RV64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
-; RV64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
-; RV64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
-; RV64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
-; RV64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
-; RV64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
-; RV64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
-; RV64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
-; RV64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
-; RV64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
-; RV64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
-; RV64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
-; RV64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
-; RV64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmin16(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
+; CHECK-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
+; CHECK-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
+; CHECK-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
+; CHECK-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
+; CHECK-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
+; CHECK-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
+; CHECK-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
+; CHECK-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
+; CHECK-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load half, ptr @input1_f16, align 16
@@ -432,46 +432,46 @@ entry:
 declare half @llvm.minimumnum.f16(half, half)
 
 define dso_local void @fmax16() local_unnamed_addr  {
-; RV64-LABEL: define dso_local void @fmax16(
-; RV64-SAME: ) local_unnamed_addr #[[ATTR0]] {
-; RV64-NEXT:  [[ENTRY:.*:]]
-; RV64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
-; RV64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
-; RV64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
-; RV64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
-; RV64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
-; RV64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
-; RV64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
-; RV64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
-; RV64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
-; RV64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
-; RV64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
-; RV64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
-; RV64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
-; RV64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
-; RV64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
-; RV64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
-; RV64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
-; RV64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
-; RV64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
-; RV64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
-; RV64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
-; RV64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
-; RV64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
-; RV64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
-; RV64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
-; RV64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
-; RV64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
-; RV64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
-; RV64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
-; RV64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
-; RV64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
-; RV64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
-; RV64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
-; RV64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
-; RV64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
-; RV64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
-; RV64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmax16(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
+; CHECK-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
+; CHECK-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
+; CHECK-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
+; CHECK-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
+; CHECK-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
+; CHECK-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
+; CHECK-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
+; CHECK-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
+; CHECK-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load half, ptr @input1_f16, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll
index 9f8783baea4a9..889bd5aa81a06 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt --passes=slp-vectorizer --mtriple=x86_64 -S < %s | FileCheck %s --check-prefix=X64
+; RUN: opt --passes=slp-vectorizer --mtriple=x86_64 -S < %s | FileCheck %s
 
 @input1_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
 @input2_f32 = dso_local local_unnamed_addr global [9 x float] zeroinitializer, align 16
@@ -12,45 +12,45 @@
 @output_f16 = dso_local local_unnamed_addr global [9 x half] zeroinitializer, align 16
 
 define dso_local void @fmin32() local_unnamed_addr  {
-; X64-LABEL: define dso_local void @fmin32() local_unnamed_addr {
-; X64-NEXT:  [[ENTRY:.*:]]
-; X64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
-; X64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
-; X64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
-; X64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
-; X64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
-; X64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
-; X64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
-; X64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
-; X64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
-; X64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
-; X64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
-; X64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
-; X64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
-; X64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
-; X64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
-; X64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
-; X64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
-; X64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
-; X64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
-; X64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
-; X64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
-; X64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
-; X64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
-; X64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
-; X64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
-; X64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
-; X64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
-; X64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
-; X64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
-; X64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
-; X64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
-; X64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
-; X64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
-; X64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
-; X64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
-; X64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
-; X64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmin32() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
+; CHECK-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
+; CHECK-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]])
+; CHECK-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]])
+; CHECK-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]])
+; CHECK-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]])
+; CHECK-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]])
+; CHECK-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load float, ptr @input1_f32, align 16
@@ -95,45 +95,45 @@ entry:
 declare float @llvm.minimumnum.f32(float, float)
 
 define dso_local void @fmax32() local_unnamed_addr  {
-; X64-LABEL: define dso_local void @fmax32() local_unnamed_addr {
-; X64-NEXT:  [[ENTRY:.*:]]
-; X64-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
-; X64-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
-; X64-NEXT:    [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
-; X64-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
-; X64-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
-; X64-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
-; X64-NEXT:    [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
-; X64-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
-; X64-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
-; X64-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
-; X64-NEXT:    [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
-; X64-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
-; X64-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
-; X64-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
-; X64-NEXT:    [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
-; X64-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
-; X64-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
-; X64-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
-; X64-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
-; X64-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
-; X64-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
-; X64-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
-; X64-NEXT:    [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
-; X64-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
-; X64-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
-; X64-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
-; X64-NEXT:    [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
-; X64-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
-; X64-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
-; X64-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
-; X64-NEXT:    [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
-; X64-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
-; X64-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
-; X64-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
-; X64-NEXT:    [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
-; X64-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
-; X64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmax32() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]])
+; CHECK-NEXT:    store float [[TMP2]], ptr @output_f32, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]])
+; CHECK-NEXT:    store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT:    store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]])
+; CHECK-NEXT:    store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]])
+; CHECK-NEXT:    store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT:    store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]])
+; CHECK-NEXT:    store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]])
+; CHECK-NEXT:    store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]])
+; CHECK-NEXT:    store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load float, ptr @input1_f32, align 16
@@ -178,45 +178,45 @@ entry:
 declare float @llvm.maximumnum.f32(float, float)
 
 define dso_local void @fmin64() local_unnamed_addr  {
-; X64-LABEL: define dso_local void @fmin64() local_unnamed_addr {
-; X64-NEXT:  [[ENTRY:.*:]]
-; X64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
-; X64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
-; X64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
-; X64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
-; X64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
-; X64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
-; X64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
-; X64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
-; X64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
-; X64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
-; X64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
-; X64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
-; X64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
-; X64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
-; X64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
-; X64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
-; X64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
-; X64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
-; X64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
-; X64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
-; X64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
-; X64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
-; X64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
-; X64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
-; X64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
-; X64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
-; X64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
-; X64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
-; X64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
-; X64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
-; X64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
-; X64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
-; X64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
-; X64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
-; X64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
-; X64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
-; X64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmin64() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]])
+; CHECK-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]])
+; CHECK-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]])
+; CHECK-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]])
+; CHECK-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]])
+; CHECK-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]])
+; CHECK-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]])
+; CHECK-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]])
+; CHECK-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load double, ptr @input1_f64, align 16
@@ -261,45 +261,45 @@ entry:
 declare double @llvm.minimumnum.f64(double, double)
 
 define dso_local void @fmax64() local_unnamed_addr  {
-; X64-LABEL: define dso_local void @fmax64() local_unnamed_addr {
-; X64-NEXT:  [[ENTRY:.*:]]
-; X64-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
-; X64-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
-; X64-NEXT:    [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
-; X64-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
-; X64-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
-; X64-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
-; X64-NEXT:    [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
-; X64-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
-; X64-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
-; X64-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
-; X64-NEXT:    [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
-; X64-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
-; X64-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
-; X64-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
-; X64-NEXT:    [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
-; X64-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
-; X64-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
-; X64-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
-; X64-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
-; X64-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
-; X64-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
-; X64-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
-; X64-NEXT:    [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
-; X64-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
-; X64-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
-; X64-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
-; X64-NEXT:    [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
-; X64-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
-; X64-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
-; X64-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
-; X64-NEXT:    [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
-; X64-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
-; X64-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
-; X64-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
-; X64-NEXT:    [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
-; X64-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
-; X64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmax64() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr @input1_f64, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr @input2_f64, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]])
+; CHECK-NEXT:    store double [[TMP2]], ptr @output_f64, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]])
+; CHECK-NEXT:    store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT:    store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]])
+; CHECK-NEXT:    store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]])
+; CHECK-NEXT:    store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]])
+; CHECK-NEXT:    store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]])
+; CHECK-NEXT:    store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16
+; CHECK-NEXT:    [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]])
+; CHECK-NEXT:    store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8
+; CHECK-NEXT:    [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]])
+; CHECK-NEXT:    store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load double, ptr @input1_f64, align 16
@@ -344,45 +344,45 @@ entry:
 declare double @llvm.maximumnum.f64(double, double)
 
 define dso_local void @fmin16() local_unnamed_addr  {
-; X64-LABEL: define dso_local void @fmin16() local_unnamed_addr {
-; X64-NEXT:  [[ENTRY:.*:]]
-; X64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
-; X64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
-; X64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
-; X64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
-; X64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
-; X64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
-; X64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
-; X64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
-; X64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
-; X64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
-; X64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
-; X64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
-; X64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
-; X64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
-; X64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
-; X64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
-; X64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
-; X64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
-; X64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
-; X64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
-; X64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
-; X64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
-; X64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
-; X64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
-; X64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
-; X64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
-; X64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
-; X64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
-; X64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
-; X64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
-; X64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
-; X64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
-; X64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
-; X64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
-; X64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
-; X64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
-; X64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmin16() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]])
+; CHECK-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]])
+; CHECK-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]])
+; CHECK-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]])
+; CHECK-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]])
+; CHECK-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]])
+; CHECK-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]])
+; CHECK-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]])
+; CHECK-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]])
+; CHECK-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load half, ptr @input1_f16, align 16
@@ -427,45 +427,45 @@ entry:
 declare half @llvm.minimumnum.f16(half, half)
 
 define dso_local void @fmax16() local_unnamed_addr  {
-; X64-LABEL: define dso_local void @fmax16() local_unnamed_addr {
-; X64-NEXT:  [[ENTRY:.*:]]
-; X64-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
-; X64-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
-; X64-NEXT:    [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
-; X64-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
-; X64-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
-; X64-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
-; X64-NEXT:    [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
-; X64-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
-; X64-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
-; X64-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
-; X64-NEXT:    [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
-; X64-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
-; X64-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
-; X64-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
-; X64-NEXT:    [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
-; X64-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
-; X64-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
-; X64-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
-; X64-NEXT:    [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
-; X64-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
-; X64-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
-; X64-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
-; X64-NEXT:    [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
-; X64-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
-; X64-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
-; X64-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
-; X64-NEXT:    [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
-; X64-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
-; X64-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
-; X64-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
-; X64-NEXT:    [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
-; X64-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
-; X64-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
-; X64-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
-; X64-NEXT:    [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
-; X64-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
-; X64-NEXT:    ret void
+; CHECK-LABEL: define dso_local void @fmax16() local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr @input1_f16, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr @input2_f16, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]])
+; CHECK-NEXT:    store half [[TMP2]], ptr @output_f16, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]])
+; CHECK-NEXT:    store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]])
+; CHECK-NEXT:    store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]])
+; CHECK-NEXT:    store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2
+; CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]])
+; CHECK-NEXT:    store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]])
+; CHECK-NEXT:    store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2
+; CHECK-NEXT:    [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]])
+; CHECK-NEXT:    store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]])
+; CHECK-NEXT:    store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+; CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]])
+; CHECK-NEXT:    store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+; CHECK-NEXT:    ret void
 ;
 entry:
   %input0_0 = load half, ptr @input1_f16, align 16


