[llvm] [LLVM][AArch64][tblgen]: Match clamp pattern (PR #75529)

Hassnaa Hamdi via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 19 11:04:01 PST 2023


https://github.com/hassnaaHamdi updated https://github.com/llvm/llvm-project/pull/75529
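
For readers skimming the patches below: the change teaches AArch64 instruction
selection to fold an unpredicated min(max(a,b),c) sequence into a single
SVE2p1/SME clamp instruction. A minimal sketch of the kind of IR involved,
modelled on the tests added in patch 1 (the function name, RUN line and the
expectation of a single uclamp are illustrative; exact register operands
depend on register allocation):

; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s

; Clamp %a to the range [%lo, %hi]. Without the patch the umax/umin pair
; lowers to separate max and min instructions; with it, the sequence is
; expected to be selected as a single uclamp.
define <vscale x 16 x i8> @uclamp_sketch(<vscale x 16 x i8> %a, <vscale x 16 x i8> %lo, <vscale x 16 x i8> %hi) {
; CHECK-LABEL: uclamp_sketch:
; CHECK: uclamp
  %max = call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %lo)
  %res = call <vscale x 16 x i8> @llvm.umin.nxv16i8(<vscale x 16 x i8> %max, <vscale x 16 x i8> %hi)
  ret <vscale x 16 x i8> %res
}

declare <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 16 x i8> @llvm.umin.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)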

>From 20ae78bd9b1a7f7ea88d2b3b2c63c31e7e9631c4 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Thu, 14 Dec 2023 20:31:21 +0000
Subject: [PATCH 1/4] [LLVM][AArch64][tblgen]: Match clamp pattern

Replace pattern min(max(v1,v2),v3) by clamp
Add tests for uclamp, sclamp, bfclamp, fclamp.

Change-Id: I6c48835abe0a4ea679fd99d678716642c2572146
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  34 +++-
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |   8 +-
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |  20 ++-
 .../CodeGen/AArch64/sve2-min-max-clamp.ll     | 153 ++++++++++++++++++
 4 files changed, 205 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 50527e08a06165..d60462131f6da7 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3802,9 +3802,6 @@ let Predicates = [HasSVE2BitPerm] in {
 let Predicates = [HasSVE2p1_or_HasSME] in {
 defm REVD_ZPmZ : sve2_int_perm_revd<"revd", AArch64revd_mt>;
 
-defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0, int_aarch64_sve_sclamp>;
-defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1, int_aarch64_sve_uclamp>;
-
 defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>;
 } // End HasSVE2p1_or_HasSME
 
@@ -3813,7 +3810,6 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>;
 //===----------------------------------------------------------------------===//
 
 let Predicates = [HasSVE2p1_or_HasSME2] in {
-defm FCLAMP_ZZZ : sve2p1_fclamp<"fclamp", int_aarch64_sve_fclamp>;
 
 defm FDOT_ZZZ_S  : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>;
 defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>;
@@ -4054,10 +4050,36 @@ defm BFMAXNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmaxnm_p>;
 defm BFMINNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fminnm_p>;
 
 defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul", int_aarch64_sve_fmul_lane>;
-
-defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", int_aarch64_sve_fclamp>;
 } // End HasSVE2p1_or_HasSME2p1, HasB16B16
 
+// Replace pattern min(max(v1,v2),v3) by clamp
+def clamp_min_max : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
+                              [(AArch64smin_p (SVEAllActive),
+                                  (AArch64smax_p (SVEAllActive), node:$Zd, node:$Zn),
+                                  node:$Zm),
+                              (AArch64fmin_p (SVEAllActive),
+                                  (AArch64fmax_p (SVEAllActive), node:$Zd, node:$Zn),
+                                  node:$Zm)
+                               ]>;
+def uclamp_min_max : PatFrag<(ops node:$Zd, node:$Zn, node:$Zm),
+                              (AArch64umin_p (SVEAllActive),
+                                  (AArch64umax_p (SVEAllActive), node:$Zd, node:$Zn),
+                                  node:$Zm)>;
+def bfclamp_min_max : PatFrag<(ops node:$pg, node:$Zd, node:$Zn, node:$Zm),
+                              (int_aarch64_sve_fmin node:$pg,
+                               (nxv8bf16 (int_aarch64_sve_fmax node:$pg, node:$Zd, node:$Zn)),
+                               node:$Zm)>;
+
+let Predicates = [HasSVE2p1_or_HasSME] in {
+defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0, int_aarch64_sve_sclamp, clamp_min_max>;
+defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1, int_aarch64_sve_uclamp, uclamp_min_max>;
+}
+let Predicates = [HasSVE2p1_or_HasSME2] in {
+defm FCLAMP_ZZZ : sve2p1_fclamp<"fclamp", int_aarch64_sve_fclamp, clamp_min_max>;
+}
+let Predicates = [HasSVE2p1, HasB16B16] in {
+defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", int_aarch64_sve_fclamp, bfclamp_min_max>;
+}
 
 //===----------------------------------------------------------------------===//
 // SME2.1 or SVE2.1 instructions
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 4f8917618ea40a..f80b85772068da 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -1282,7 +1282,8 @@ class sve2_clamp<string asm, bits<2> sz, bit U, ZPRRegOp zpr_ty>
   let ElementSize = zpr_ty.ElementSize;
 }
 
-multiclass sve2_clamp<string asm, bit U, SDPatternOperator op> {
+multiclass sve2_clamp<string asm, bit U, SDPatternOperator op,
+                      SDPatternOperator predicated_op = null_frag> {
   def _B : sve2_clamp<asm, 0b00, U, ZPR8>;
   def _H : sve2_clamp<asm, 0b01, U, ZPR16>;
   def _S : sve2_clamp<asm, 0b10, U, ZPR32>;
@@ -1292,6 +1293,11 @@ multiclass sve2_clamp<string asm, bit U, SDPatternOperator op> {
   def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
   def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
   def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+  def : SVE_3_Op_Pat<nxv16i8, predicated_op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+  def : SVE_3_Op_Pat<nxv8i16, predicated_op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4i32, predicated_op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2i64, predicated_op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
 }
 
 class sve2_int_perm_sel_p<string asm, PPRRegOp ppr_ty, Operand imm_ty>
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 9edf26052247ae..1cd7018683f31c 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -470,6 +470,12 @@ class SVE_2_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op,
 : Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2)),
       (inst $Op1, $Op2)>;
 
+class SVE_3_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op,
+                               ValueType vt1, ValueType pt, ValueType vt2,
+                               ValueType vt3, Instruction inst>
+: Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2, vt3:$Op3)),
+      (inst $Op1, $Op2, $Op3)>;
+
 class SVE_2_Op_Pred_All_Active_Pt<ValueType vtd, SDPatternOperator op,
                                   ValueType pt, ValueType vt1, ValueType vt2,
                                   Instruction inst>
@@ -9228,7 +9234,8 @@ class sve2p1_fclamp<string asm, bits<2> sz, ZPRRegOp zpr_ty>
   let hasSideEffects = 0;
 }
 
-multiclass sve2p1_fclamp<string asm, SDPatternOperator op> {
+multiclass sve2p1_fclamp<string asm, SDPatternOperator op,
+                          SDPatternOperator predicated_op = null_frag> {
   def _H : sve2p1_fclamp<asm, 0b01, ZPR16>;
   def _S : sve2p1_fclamp<asm, 0b10, ZPR32>;
   def _D : sve2p1_fclamp<asm, 0b11, ZPR64>;
@@ -9236,11 +9243,18 @@ multiclass sve2p1_fclamp<string asm, SDPatternOperator op> {
   def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
   def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
   def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+  def : SVE_3_Op_Pat<nxv8f16, predicated_op, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4f32, predicated_op, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2f64, predicated_op, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
 }
 
-multiclass sve2p1_bfclamp<string asm, SDPatternOperator op> {
+multiclass sve2p1_bfclamp<string asm, SDPatternOperator op,
+                          SDPatternOperator predicated_op = null_frag> {
   def NAME : sve2p1_fclamp<asm, 0b00, ZPR16>;
-  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+    def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+
+    def : SVE_3_Op_Pred_All_Active<nxv8bf16, predicated_op, nxv8bf16, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
 }
 
 // SVE two-way dot product
diff --git a/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll b/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll
new file mode 100644
index 00000000000000..35e0b2ef3af05f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 < %s | FileCheck %s
+
+; Replace pattern min(max(v1,v2),v3) by clamp
+
+define <vscale x 16 x i8> @uclampi8(<vscale x 16 x i8> %c, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: uclampi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z0.b, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  %res = tail call <vscale x 16 x i8> @llvm.umin.nxv16i8(<vscale x 16 x i8> %min, <vscale x 16 x i8> %c)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @uclampi16(<vscale x 8 x i16> %c, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uclampi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z0.h, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 8 x i16> @llvm.umax.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+  %res = tail call <vscale x 8 x i16> @llvm.umin.nxv8i16(<vscale x 8 x i16> %min, <vscale x 8 x i16> %c)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @uclampi32(<vscale x 4 x i32> %c, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uclampi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z0.s, z1.s, z2.s
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  %res = tail call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> %min, <vscale x 4 x i32> %c)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @uclampi64(<vscale x 2 x i64> %c, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: uclampi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z0.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 2 x i64> @llvm.umax.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+  %res = tail call <vscale x 2 x i64> @llvm.umin.nxv2i64(<vscale x 2 x i64> %min, <vscale x 2 x i64> %c)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 16 x i8> @sclampi8(<vscale x 16 x i8> %c, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: sclampi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z0.b, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  %res = tail call <vscale x 16 x i8> @llvm.smin.nxv16i8(<vscale x 16 x i8> %min, <vscale x 16 x i8> %c)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @sclampi16(<vscale x 8 x i16> %c, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sclampi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z0.h, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 8 x i16> @llvm.smax.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+  %res = tail call <vscale x 8 x i16> @llvm.smin.nxv8i16(<vscale x 8 x i16> %min, <vscale x 8 x i16> %c)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @sclampi32(<vscale x 4 x i32> %c, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sclampi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z0.s, z1.s, z2.s
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  %res = tail call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %min, <vscale x 4 x i32> %c)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @sclampi64(<vscale x 2 x i64> %c, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sclampi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z0.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 2 x i64> @llvm.smax.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+  %res = tail call <vscale x 2 x i64> @llvm.smin.nxv2i64(<vscale x 2 x i64> %min, <vscale x 2 x i64> %c)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 8 x bfloat> @fclampbf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
+; CHECK-LABEL: fclampbf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfclamp z0.h, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %min = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1>%pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  %res = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1>%pg, <vscale x 8 x bfloat> %min, <vscale x 8 x bfloat> %c)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x half> @fclampf16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
+; CHECK-LABEL: fclampf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fclamp z0.h, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %min = call <vscale x 8 x half> @llvm.maximum.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  %res = call <vscale x 8 x half> @llvm.minimum.nxv8f16(<vscale x 8 x half> %min, <vscale x 8 x half> %c)
+  ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x float> @fclampf32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) {
+; CHECK-LABEL: fclampf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fclamp z0.s, z1.s, z2.s
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 4 x float> @llvm.maximum.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  %res = tail call <vscale x 4 x float> @llvm.minimum.nxv4f32(<vscale x 4 x float> %min, <vscale x 4 x float> %c)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 2 x double> @fclampf64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) {
+; CHECK-LABEL: fclampf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fclamp z0.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 2 x double> @llvm.maximum.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  %res = tail call <vscale x 2 x double> @llvm.minimum.nxv2f64(<vscale x 2 x double> %min, <vscale x 2 x double> %c)
+  ret <vscale x 2 x double> %res
+}
+
+declare <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i8> @llvm.umin.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.umax.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i16> @llvm.umin.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.umax.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.umin.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i8> @llvm.smin.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.smax.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i16> @llvm.smin.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.smax.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.smin.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x half>   @llvm.maximum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x half>   @llvm.minimum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float>  @llvm.maximum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x float>  @llvm.minimum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.maximum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.minimum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)

>From dc0cc53d4a21773b4a0c37a576c22bed07237587 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Mon, 18 Dec 2023 17:37:08 +0000
Subject: [PATCH 2/4] Combine the intrinsic and the pattern into one PatFrags

Change-Id: I4cffc3c4f66e6a1dbeca2637b06db56509291909
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 60 ++++++++++---------
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  8 +--
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 12 +---
 3 files changed, 35 insertions(+), 45 deletions(-)
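
The practical effect of folding the intrinsic and the open-coded min/max form
into a single PatFrags is that both shapes of IR are covered by one selection
pattern per element type. A hedged sketch (the sclamp intrinsic is spelled
following the usual llvm.aarch64.sve.* mangling; the checks only look for the
mnemonic):

; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s

; Direct use of the ACLE-style clamp intrinsic.
define <vscale x 16 x i8> @via_intrinsic(<vscale x 16 x i8> %op, <vscale x 16 x i8> %lo, <vscale x 16 x i8> %hi) {
; CHECK-LABEL: via_intrinsic:
; CHECK: sclamp
  %r = call <vscale x 16 x i8> @llvm.aarch64.sve.sclamp.nxv16i8(<vscale x 16 x i8> %op, <vscale x 16 x i8> %lo, <vscale x 16 x i8> %hi)
  ret <vscale x 16 x i8> %r
}

; Open-coded min(max(op,lo),hi): matched by the same AArch64sclamp PatFrags.
define <vscale x 16 x i8> @via_min_max(<vscale x 16 x i8> %op, <vscale x 16 x i8> %lo, <vscale x 16 x i8> %hi) {
; CHECK-LABEL: via_min_max:
; CHECK: sclamp
  %max = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> %op, <vscale x 16 x i8> %lo)
  %r = call <vscale x 16 x i8> @llvm.smin.nxv16i8(<vscale x 16 x i8> %max, <vscale x 16 x i8> %hi)
  ret <vscale x 16 x i8> %r
}

declare <vscale x 16 x i8> @llvm.aarch64.sve.sclamp.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 16 x i8> @llvm.smin.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)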

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index d60462131f6da7..63809eea7f52b9 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -316,6 +316,32 @@ def AArch64ssra : PatFrags<(ops node:$op1, node:$op2, node:$op3),
                            [(int_aarch64_sve_ssra node:$op1, node:$op2, node:$op3),
                             (add node:$op1, (AArch64asr_p (SVEAnyPredicate), node:$op2, (SVEShiftSplatImmR (i32 node:$op3))))]>;
 
+// Replace pattern min(max(v1,v2),v3) by clamp
+def AArch64sclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
+                              [(int_aarch64_sve_sclamp node:$Zd, node:$Zn, node:$Zm),
+                              (AArch64smin_p (SVEAllActive),
+                                  (AArch64smax_p (SVEAllActive), node:$Zd, node:$Zn),
+                                  node:$Zm)
+                               ]>;
+def AArch64uclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
+                              [(int_aarch64_sve_uclamp node:$Zd, node:$Zn, node:$Zm),
+                               (AArch64umin_p (SVEAllActive),
+                                  (AArch64umax_p (SVEAllActive), node:$Zd, node:$Zn),
+                                  node:$Zm)
+                              ]>;
+def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
+                              [(int_aarch64_sve_fclamp node:$Zd, node:$Zn, node:$Zm),
+                              (AArch64fmin_p (SVEAllActive),
+                                  (AArch64fmax_p (SVEAllActive), node:$Zd, node:$Zn),
+                                  node:$Zm)
+                               ]>;
+def AArch64bfclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
+                              [(int_aarch64_sve_fclamp node:$Zd, node:$Zn, node:$Zm),
+                              (int_aarch64_sve_fmin (nxv8i1 (SVEAllActive)),
+                                  (nxv8bf16 (int_aarch64_sve_fmax (nxv8i1 (SVEAllActive)), node:$Zd, node:$Zn)),
+                               node:$Zm)
+                               ]>;
+
 def SDT_AArch64FCVT : SDTypeProfile<1, 3, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
   SDTCVecEltisVT<1,i1>
@@ -3802,6 +3828,9 @@ let Predicates = [HasSVE2BitPerm] in {
 let Predicates = [HasSVE2p1_or_HasSME] in {
 defm REVD_ZPmZ : sve2_int_perm_revd<"revd", AArch64revd_mt>;
 
+defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0, AArch64sclamp>;
+defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1, AArch64uclamp>;
+
 defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>;
 } // End HasSVE2p1_or_HasSME
 
@@ -3810,6 +3839,7 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>;
 //===----------------------------------------------------------------------===//
 
 let Predicates = [HasSVE2p1_or_HasSME2] in {
+defm FCLAMP_ZZZ : sve2p1_fclamp<"fclamp", AArch64fclamp>;
 
 defm FDOT_ZZZ_S  : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>;
 defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>;
@@ -4050,36 +4080,10 @@ defm BFMAXNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmaxnm_p>;
 defm BFMINNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fminnm_p>;
 
 defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul", int_aarch64_sve_fmul_lane>;
-} // End HasSVE2p1_or_HasSME2p1, HasB16B16
 
-// Replace pattern min(max(v1,v2),v3) by clamp
-def clamp_min_max : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
-                              [(AArch64smin_p (SVEAllActive),
-                                  (AArch64smax_p (SVEAllActive), node:$Zd, node:$Zn),
-                                  node:$Zm),
-                              (AArch64fmin_p (SVEAllActive),
-                                  (AArch64fmax_p (SVEAllActive), node:$Zd, node:$Zn),
-                                  node:$Zm)
-                               ]>;
-def uclamp_min_max : PatFrag<(ops node:$Zd, node:$Zn, node:$Zm),
-                              (AArch64umin_p (SVEAllActive),
-                                  (AArch64umax_p (SVEAllActive), node:$Zd, node:$Zn),
-                                  node:$Zm)>;
-def bfclamp_min_max : PatFrag<(ops node:$pg, node:$Zd, node:$Zn, node:$Zm),
-                              (int_aarch64_sve_fmin node:$pg,
-                               (nxv8bf16 (int_aarch64_sve_fmax node:$pg, node:$Zd, node:$Zn)),
-                               node:$Zm)>;
+defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", AArch64bfclamp>;
+} // End HasSVE2p1_or_HasSME2p1, HasB16B16
 
-let Predicates = [HasSVE2p1_or_HasSME] in {
-defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0, int_aarch64_sve_sclamp, clamp_min_max>;
-defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1, int_aarch64_sve_uclamp, uclamp_min_max>;
-}
-let Predicates = [HasSVE2p1_or_HasSME2] in {
-defm FCLAMP_ZZZ : sve2p1_fclamp<"fclamp", int_aarch64_sve_fclamp, clamp_min_max>;
-}
-let Predicates = [HasSVE2p1, HasB16B16] in {
-defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", int_aarch64_sve_fclamp, bfclamp_min_max>;
-}
 
 //===----------------------------------------------------------------------===//
 // SME2.1 or SVE2.1 instructions
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index f80b85772068da..4f8917618ea40a 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -1282,8 +1282,7 @@ class sve2_clamp<string asm, bits<2> sz, bit U, ZPRRegOp zpr_ty>
   let ElementSize = zpr_ty.ElementSize;
 }
 
-multiclass sve2_clamp<string asm, bit U, SDPatternOperator op,
-                      SDPatternOperator predicated_op = null_frag> {
+multiclass sve2_clamp<string asm, bit U, SDPatternOperator op> {
   def _B : sve2_clamp<asm, 0b00, U, ZPR8>;
   def _H : sve2_clamp<asm, 0b01, U, ZPR16>;
   def _S : sve2_clamp<asm, 0b10, U, ZPR32>;
@@ -1293,11 +1292,6 @@ multiclass sve2_clamp<string asm, bit U, SDPatternOperator op,
   def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
   def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
   def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
-
-  def : SVE_3_Op_Pat<nxv16i8, predicated_op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
-  def : SVE_3_Op_Pat<nxv8i16, predicated_op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_3_Op_Pat<nxv4i32, predicated_op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_3_Op_Pat<nxv2i64, predicated_op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
 }
 
 class sve2_int_perm_sel_p<string asm, PPRRegOp ppr_ty, Operand imm_ty>
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 1cd7018683f31c..4a163ecc611cfa 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -9234,8 +9234,7 @@ class sve2p1_fclamp<string asm, bits<2> sz, ZPRRegOp zpr_ty>
   let hasSideEffects = 0;
 }
 
-multiclass sve2p1_fclamp<string asm, SDPatternOperator op,
-                          SDPatternOperator predicated_op = null_frag> {
+multiclass sve2p1_fclamp<string asm, SDPatternOperator op> {
   def _H : sve2p1_fclamp<asm, 0b01, ZPR16>;
   def _S : sve2p1_fclamp<asm, 0b10, ZPR32>;
   def _D : sve2p1_fclamp<asm, 0b11, ZPR64>;
@@ -9243,18 +9242,11 @@ multiclass sve2p1_fclamp<string asm, SDPatternOperator op,
   def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
   def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
   def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
-
-  def : SVE_3_Op_Pat<nxv8f16, predicated_op, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_3_Op_Pat<nxv4f32, predicated_op, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_3_Op_Pat<nxv2f64, predicated_op, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
 }
 
-multiclass sve2p1_bfclamp<string asm, SDPatternOperator op,
-                          SDPatternOperator predicated_op = null_frag> {
+multiclass sve2p1_bfclamp<string asm, SDPatternOperator op> {
   def NAME : sve2p1_fclamp<asm, 0b00, ZPR16>;
     def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
-
-    def : SVE_3_Op_Pred_All_Active<nxv8bf16, predicated_op, nxv8bf16, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
 }
 
 // SVE two-way dot product

>From 214c320ba7652cc7f25b8b209664b4ccdb3529d0 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Tue, 19 Dec 2023 17:52:10 +0000
Subject: [PATCH 3/4] Use fminnm/fmaxnm because they are the ones used in the
 pseudocode of the clamp instruction

Change-Id: If49a04c4eb806b92ddfaaa0ef035d74f0360db9d
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 21 +++++-----
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |  8 +---
 .../CodeGen/AArch64/sve2-min-max-clamp.ll     | 38 +++++++++++--------
 3 files changed, 35 insertions(+), 32 deletions(-)
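
The reason fminnm/fmaxnm (and, in patch 4, the generic llvm.minnum/llvm.maxnum
intrinsics) are the right match is their NaN behaviour: like the FMINNM/FMAXNM
operations used in the clamp pseudocode, they ignore a quiet NaN operand,
whereas llvm.minimum/llvm.maximum propagate NaN and so cannot legally be folded
into FCLAMP. A small illustrative sketch, not part of the patch:

; minNum semantics: the quiet NaN operand is ignored, so this evaluates to 1.0.
; This is the behaviour of FMINNM and of the SVE fminnm intrinsic.
define float @uses_minnum() {
  %r = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0)
  ret float %r
}

; llvm.minimum propagates NaN, so this evaluates to NaN; a min/max sequence
; built from minimum/maximum does not have FCLAMP semantics.
define float @uses_minimum() {
  %r = call float @llvm.minimum.f32(float 0x7FF8000000000000, float 1.0)
  ret float %r
}

declare float @llvm.minnum.f32(float, float)
declare float @llvm.minimum.f32(float, float)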

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 63809eea7f52b9..547ac4a31d73fc 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -331,16 +331,19 @@ def AArch64uclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
                               ]>;
 def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
                               [(int_aarch64_sve_fclamp node:$Zd, node:$Zn, node:$Zm),
-                              (AArch64fmin_p (SVEAllActive),
-                                  (AArch64fmax_p (SVEAllActive), node:$Zd, node:$Zn),
+                              (int_aarch64_sve_fminnm (nxv8i1 (SVEAllActive)),
+                                  (nxv8bf16 (int_aarch64_sve_fmaxnm (nxv8i1 (SVEAllActive)), node:$Zd, node:$Zn)),
+                               node:$Zm),
+                              (int_aarch64_sve_fminnm (nxv8i1 (SVEAllActive)),
+                                  (nxv8f16 (int_aarch64_sve_fmaxnm (nxv8i1 (SVEAllActive)), node:$Zd, node:$Zn)),
+                                  node:$Zm),
+                              (int_aarch64_sve_fminnm (nxv4i1 (SVEAllActive)),
+                                  (nxv4f32 (int_aarch64_sve_fmaxnm (nxv4i1 (SVEAllActive)), node:$Zd, node:$Zn)),
+                                  node:$Zm),
+                              (int_aarch64_sve_fminnm (nxv2i1 (SVEAllActive)),
+                                  (nxv2f64 (int_aarch64_sve_fmaxnm (nxv2i1 (SVEAllActive)), node:$Zd, node:$Zn)),
                                   node:$Zm)
                                ]>;
-def AArch64bfclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
-                              [(int_aarch64_sve_fclamp node:$Zd, node:$Zn, node:$Zm),
-                              (int_aarch64_sve_fmin (nxv8i1 (SVEAllActive)),
-                                  (nxv8bf16 (int_aarch64_sve_fmax (nxv8i1 (SVEAllActive)), node:$Zd, node:$Zn)),
-                               node:$Zm)
-                               ]>;
 
 def SDT_AArch64FCVT : SDTypeProfile<1, 3, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
@@ -4081,7 +4084,7 @@ defm BFMINNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fminnm_p>;
 
 defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul", int_aarch64_sve_fmul_lane>;
 
-defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", AArch64bfclamp>;
+defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", AArch64fclamp>;
 } // End HasSVE2p1_or_HasSME2p1, HasB16B16
 
 
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 4a163ecc611cfa..9edf26052247ae 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -470,12 +470,6 @@ class SVE_2_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op,
 : Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2)),
       (inst $Op1, $Op2)>;
 
-class SVE_3_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op,
-                               ValueType vt1, ValueType pt, ValueType vt2,
-                               ValueType vt3, Instruction inst>
-: Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2, vt3:$Op3)),
-      (inst $Op1, $Op2, $Op3)>;
-
 class SVE_2_Op_Pred_All_Active_Pt<ValueType vtd, SDPatternOperator op,
                                   ValueType pt, ValueType vt1, ValueType vt2,
                                   Instruction inst>
@@ -9246,7 +9240,7 @@ multiclass sve2p1_fclamp<string asm, SDPatternOperator op> {
 
 multiclass sve2p1_bfclamp<string asm, SDPatternOperator op> {
   def NAME : sve2p1_fclamp<asm, 0b00, ZPR16>;
-    def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
 }
 
 // SVE two-way dot product
diff --git a/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll b/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll
index 35e0b2ef3af05f..2eba93981ee9fd 100644
--- a/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll
@@ -89,8 +89,8 @@ define <vscale x 8 x bfloat> @fclampbf16(<vscale x 8 x bfloat> %a, <vscale x 8 x
 ; CHECK-NEXT:    bfclamp z0.h, z1.h, z2.h
 ; CHECK-NEXT:    ret
   %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-  %min = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1>%pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
-  %res = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1>%pg, <vscale x 8 x bfloat> %min, <vscale x 8 x bfloat> %c)
+  %min = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1>%pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  %res = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1>%pg, <vscale x 8 x bfloat> %min, <vscale x 8 x bfloat> %c)
   ret <vscale x 8 x bfloat> %res
 }
 
@@ -99,8 +99,9 @@ define <vscale x 8 x half> @fclampf16(<vscale x 8 x half> %a, <vscale x 8 x half
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fclamp z0.h, z1.h, z2.h
 ; CHECK-NEXT:    ret
-  %min = call <vscale x 8 x half> @llvm.maximum.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
-  %res = call <vscale x 8 x half> @llvm.minimum.nxv8f16(<vscale x 8 x half> %min, <vscale x 8 x half> %c)
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %min = call <vscale x 8 x half> @llvm.aarch64.sve.fmaxnm.nxv8f16(<vscale x 8 x i1>%pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.fminnm.nxv8f16(<vscale x 8 x i1>%pg, <vscale x 8 x half> %min, <vscale x 8 x half> %c)
   ret <vscale x 8 x half> %res
 }
 
@@ -109,8 +110,9 @@ define <vscale x 4 x float> @fclampf32(<vscale x 4 x float> %a, <vscale x 4 x fl
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fclamp z0.s, z1.s, z2.s
 ; CHECK-NEXT:    ret
-  %min = tail call <vscale x 4 x float> @llvm.maximum.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
-  %res = tail call <vscale x 4 x float> @llvm.minimum.nxv4f32(<vscale x 4 x float> %min, <vscale x 4 x float> %c)
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %min = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmaxnm.nxv4f32(<vscale x 4 x i1>%pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  %res = tail call <vscale x 4 x float> @llvm.aarch64.sve.fminnm.nxv4f32(<vscale x 4 x i1>%pg, <vscale x 4 x float> %min, <vscale x 4 x float> %c)
   ret <vscale x 4 x float> %res
 }
 
@@ -119,8 +121,9 @@ define <vscale x 2 x double> @fclampf64(<vscale x 2 x double> %a, <vscale x 2 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fclamp z0.d, z1.d, z2.d
 ; CHECK-NEXT:    ret
-  %min = tail call <vscale x 2 x double> @llvm.maximum.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
-  %res = tail call <vscale x 2 x double> @llvm.minimum.nxv2f64(<vscale x 2 x double> %min, <vscale x 2 x double> %c)
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %min = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmaxnm.nxv2f64(<vscale x 2 x i1>%pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  %res = tail call <vscale x 2 x double> @llvm.aarch64.sve.fminnm.nxv2f64(<vscale x 2 x i1>%pg, <vscale x 2 x double> %min, <vscale x 2 x double> %c)
   ret <vscale x 2 x double> %res
 }
 
@@ -143,11 +146,14 @@ declare <vscale x 2 x i64> @llvm.smax.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x
 declare <vscale x 2 x i64> @llvm.smin.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
 
 declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
-declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
-declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmin.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
-declare <vscale x 8 x half>   @llvm.maximum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 8 x half>   @llvm.minimum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x float>  @llvm.maximum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x float>  @llvm.minimum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 2 x double> @llvm.maximum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 2 x double> @llvm.minimum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x half>   @llvm.aarch64.sve.fmaxnm.nxv8f16 (<vscale x 8 x i1>, <vscale x 8 x half>,   <vscale x 8 x half>)
+declare <vscale x 8 x half>   @llvm.aarch64.sve.fminnm.nxv8f16 (<vscale x 8 x i1>, <vscale x 8 x half>,   <vscale x 8 x half>)
+declare <vscale x 4 x float>  @llvm.aarch64.sve.fmaxnm.nxv4f32 (<vscale x 4 x i1>, <vscale x 4 x float>,  <vscale x 4 x float>)
+declare <vscale x 4 x float>  @llvm.aarch64.sve.fminnm.nxv4f32 (<vscale x 4 x i1>, <vscale x 4 x float>,  <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fmaxnm.nxv2f64 (<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.fminnm.nxv2f64 (<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)

>From 7be487668564536649ef9d01163840d533961a03 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Tue, 19 Dec 2023 19:03:19 +0000
Subject: [PATCH 4/4] Use the available generic LLVM intrinsics for
 half/float/double types

Change-Id: Ic079645c42f040ac0f4ec094c6d25be846e77e6e
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 19 +++++-------
 .../CodeGen/AArch64/sve2-min-max-clamp.ll     | 29 ++++++++-----------
 2 files changed, 20 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 547ac4a31d73fc..688b3078bca7dd 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -330,19 +330,16 @@ def AArch64uclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
                                   node:$Zm)
                               ]>;
 def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
+                              [(int_aarch64_sve_fclamp node:$Zd, node:$Zn, node:$Zm),
+                              (AArch64fminnm_p (SVEAllActive),
+                                  (AArch64fmaxnm_p (SVEAllActive), node:$Zd, node:$Zn),
+                               node:$Zm)
+                               ]>;
+def AArch64bfclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
                               [(int_aarch64_sve_fclamp node:$Zd, node:$Zn, node:$Zm),
                               (int_aarch64_sve_fminnm (nxv8i1 (SVEAllActive)),
                                   (nxv8bf16 (int_aarch64_sve_fmaxnm (nxv8i1 (SVEAllActive)), node:$Zd, node:$Zn)),
-                               node:$Zm),
-                              (int_aarch64_sve_fminnm (nxv8i1 (SVEAllActive)),
-                                  (nxv8f16 (int_aarch64_sve_fmaxnm (nxv8i1 (SVEAllActive)), node:$Zd, node:$Zn)),
-                                  node:$Zm),
-                              (int_aarch64_sve_fminnm (nxv4i1 (SVEAllActive)),
-                                  (nxv4f32 (int_aarch64_sve_fmaxnm (nxv4i1 (SVEAllActive)), node:$Zd, node:$Zn)),
-                                  node:$Zm),
-                              (int_aarch64_sve_fminnm (nxv2i1 (SVEAllActive)),
-                                  (nxv2f64 (int_aarch64_sve_fmaxnm (nxv2i1 (SVEAllActive)), node:$Zd, node:$Zn)),
-                                  node:$Zm)
+                               node:$Zm)
                                ]>;
 
 def SDT_AArch64FCVT : SDTypeProfile<1, 3, [
@@ -4084,7 +4081,7 @@ defm BFMINNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fminnm_p>;
 
 defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul", int_aarch64_sve_fmul_lane>;
 
-defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", AArch64fclamp>;
+defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", AArch64bfclamp>;
 } // End HasSVE2p1_or_HasSME2p1, HasB16B16
 
 
diff --git a/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll b/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll
index 2eba93981ee9fd..e703e0c8d9dc96 100644
--- a/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll
@@ -99,9 +99,8 @@ define <vscale x 8 x half> @fclampf16(<vscale x 8 x half> %a, <vscale x 8 x half
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fclamp z0.h, z1.h, z2.h
 ; CHECK-NEXT:    ret
-  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-  %min = call <vscale x 8 x half> @llvm.aarch64.sve.fmaxnm.nxv8f16(<vscale x 8 x i1>%pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
-  %res = call <vscale x 8 x half> @llvm.aarch64.sve.fminnm.nxv8f16(<vscale x 8 x i1>%pg, <vscale x 8 x half> %min, <vscale x 8 x half> %c)
+  %min = call <vscale x 8 x half> @llvm.maxnum.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  %res = call <vscale x 8 x half> @llvm.minnum.nxv8f16(<vscale x 8 x half> %min, <vscale x 8 x half> %c)
   ret <vscale x 8 x half> %res
 }
 
@@ -110,9 +109,8 @@ define <vscale x 4 x float> @fclampf32(<vscale x 4 x float> %a, <vscale x 4 x fl
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fclamp z0.s, z1.s, z2.s
 ; CHECK-NEXT:    ret
-  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-  %min = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmaxnm.nxv4f32(<vscale x 4 x i1>%pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
-  %res = tail call <vscale x 4 x float> @llvm.aarch64.sve.fminnm.nxv4f32(<vscale x 4 x i1>%pg, <vscale x 4 x float> %min, <vscale x 4 x float> %c)
+  %min = tail call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  %res = tail call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %min, <vscale x 4 x float> %c)
   ret <vscale x 4 x float> %res
 }
 
@@ -121,9 +119,8 @@ define <vscale x 2 x double> @fclampf64(<vscale x 2 x double> %a, <vscale x 2 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fclamp z0.d, z1.d, z2.d
 ; CHECK-NEXT:    ret
-  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-  %min = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmaxnm.nxv2f64(<vscale x 2 x i1>%pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
-  %res = tail call <vscale x 2 x double> @llvm.aarch64.sve.fminnm.nxv2f64(<vscale x 2 x i1>%pg, <vscale x 2 x double> %min, <vscale x 2 x double> %c)
+  %min = tail call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  %res = tail call <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> %min, <vscale x 2 x double> %c)
   ret <vscale x 2 x double> %res
 }
 
@@ -146,14 +143,12 @@ declare <vscale x 2 x i64> @llvm.smax.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x
 declare <vscale x 2 x i64> @llvm.smin.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
 
 declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
-declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
-declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
 
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fmaxnm.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fminnm.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
-declare <vscale x 8 x half>   @llvm.aarch64.sve.fmaxnm.nxv8f16 (<vscale x 8 x i1>, <vscale x 8 x half>,   <vscale x 8 x half>)
-declare <vscale x 8 x half>   @llvm.aarch64.sve.fminnm.nxv8f16 (<vscale x 8 x i1>, <vscale x 8 x half>,   <vscale x 8 x half>)
-declare <vscale x 4 x float>  @llvm.aarch64.sve.fmaxnm.nxv4f32 (<vscale x 4 x i1>, <vscale x 4 x float>,  <vscale x 4 x float>)
-declare <vscale x 4 x float>  @llvm.aarch64.sve.fminnm.nxv4f32 (<vscale x 4 x i1>, <vscale x 4 x float>,  <vscale x 4 x float>)
-declare <vscale x 2 x double> @llvm.aarch64.sve.fmaxnm.nxv2f64 (<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 2 x double> @llvm.aarch64.sve.fminnm.nxv2f64 (<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 8 x half>   @llvm.maxnum.nxv8f16 (<vscale x 8 x half>,   <vscale x 8 x half>)
+declare <vscale x 8 x half>   @llvm.minnum.nxv8f16 (<vscale x 8 x half>,   <vscale x 8 x half>)
+declare <vscale x 4 x float>  @llvm.maxnum.nxv4f32 (<vscale x 4 x float>,  <vscale x 4 x float>)
+declare <vscale x 4 x float>  @llvm.minnum.nxv4f32 (<vscale x 4 x float>,  <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.maxnum.nxv2f64 (<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.minnum.nxv2f64 (<vscale x 2 x double>, <vscale x 2 x double>)


