[clang] [llvm] [AArch64] Implement intrinsics for SVE FAMIN/FAMAX (PR #99042)

Thu Aug 29 04:06:03 PDT 2024

================
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mattr=+sve2 < %s | FileCheck %s
+; RUN: llc -mattr=+sme2 -force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define <vscale x 8 x half> @famin_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 { ; half-precision (nxv8f16) test of the llvm.aarch64.sve.famin intrinsic; attribute group #0 is defined outside this excerpt
+; CHECK-LABEL: famin_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+    %r = call <vscale x 8 x half> @llvm.aarch64.sve.famin.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) ; the assertions above expect this call to become a single p0/m-predicated famin on .h lanes, immediately followed by ret
+    ret <vscale x 8 x half> %r
+}
+
+define <vscale x 4 x float> @famin_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 { ; single-precision (nxv4f32) test of the llvm.aarch64.sve.famin intrinsic; attribute group #0 is defined outside this excerpt
+; CHECK-LABEL: famin_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+    %r = call <vscale x 4 x float> @llvm.aarch64.sve.famin.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) ; the assertions above expect this call to become a single p0/m-predicated famin on .s lanes, immediately followed by ret
+    ret <vscale x 4 x float> %r
+}
+
+define <vscale x 2 x double> @famin_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 { ; double-precision (nxv2f64) test of the llvm.aarch64.sve.famin intrinsic; attribute group #0 is defined outside this excerpt
+; CHECK-LABEL: famin_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+    %r = call <vscale x 2 x double> @llvm.aarch64.sve.famin.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) ; the assertions above expect this call to become a single p0/m-predicated famin on .d lanes, immediately followed by ret
+    ret <vscale x 2 x double> %r
+}
+
+define <vscale x 8 x half> @famin_u_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: famin_u_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    ret
+    %r = call <vscale x 8 x half> @llvm.aarch64.sve.famin.u.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
----------------
paulwalker-arm wrote:

Up to you, but it might be worth swapping `%a` and `%b` in the calls in the `_u_` functions. The output should be unchanged, which shows the significance of the `.u.`: it frees the register allocator to reuse the second operand.

https://github.com/llvm/llvm-project/pull/99042