[llvm] [LLVM][AArch64][tblgen]: Match clamp pattern (PR #75529)

Paul Walker via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 19 02:56:21 PST 2023


================
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 < %s | FileCheck %s
+
+; Replace the pattern min(max(v1,v2),v3) with a clamp instruction
+
+define <vscale x 16 x i8> @uclampi8(<vscale x 16 x i8> %c, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: uclampi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z0.b, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  %res = tail call <vscale x 16 x i8> @llvm.umin.nxv16i8(<vscale x 16 x i8> %min, <vscale x 16 x i8> %c)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @uclampi16(<vscale x 8 x i16> %c, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uclampi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z0.h, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 8 x i16> @llvm.umax.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+  %res = tail call <vscale x 8 x i16> @llvm.umin.nxv8i16(<vscale x 8 x i16> %min, <vscale x 8 x i16> %c)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @uclampi32(<vscale x 4 x i32> %c, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uclampi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z0.s, z1.s, z2.s
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  %res = tail call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> %min, <vscale x 4 x i32> %c)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @uclampi64(<vscale x 2 x i64> %c, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: uclampi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z0.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 2 x i64> @llvm.umax.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+  %res = tail call <vscale x 2 x i64> @llvm.umin.nxv2i64(<vscale x 2 x i64> %min, <vscale x 2 x i64> %c)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 16 x i8> @sclampi8(<vscale x 16 x i8> %c, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: sclampi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z0.b, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  %res = tail call <vscale x 16 x i8> @llvm.smin.nxv16i8(<vscale x 16 x i8> %min, <vscale x 16 x i8> %c)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @sclampi16(<vscale x 8 x i16> %c, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sclampi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z0.h, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 8 x i16> @llvm.smax.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+  %res = tail call <vscale x 8 x i16> @llvm.smin.nxv8i16(<vscale x 8 x i16> %min, <vscale x 8 x i16> %c)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @sclampi32(<vscale x 4 x i32> %c, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sclampi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z0.s, z1.s, z2.s
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  %res = tail call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %min, <vscale x 4 x i32> %c)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @sclampi64(<vscale x 2 x i64> %c, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sclampi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z0.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+  %min = tail call <vscale x 2 x i64> @llvm.smax.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+  %res = tail call <vscale x 2 x i64> @llvm.smin.nxv2i64(<vscale x 2 x i64> %min, <vscale x 2 x i64> %c)
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 8 x bfloat> @fclampbf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
+; CHECK-LABEL: fclampbf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfclamp z0.h, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %min = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fmax.nxv8bf16(<vscale x 8 x i1>%pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
----------------
paulwalker-arm wrote:

I agree with both @momchil-velikov and @CarolineConcatto. I see no reason why we couldn't extend the existing intrinsics (including min/max) to cover bfloat16 types, because it's just another floating-point format, much like the various 128-bit types. However, I'd avoid doing that in this patch, because the beauty of adding the PatFrag is that it should be easy to extend with other patterns later on.

Ultimately I think you're going to want to push the bfloat16 variants of the SVE and LLVM intrinsics down the same path as used for the other floating-point types (i.e. lower to MIN(NM)/MAX(NM)_PRED ISD nodes). At that point you might not need a PatFrag specifically for bfloat16, because you'll be able to infer all the types. As before though, I think this is best done as follow-on work, as this patch is nicely contained as is.
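
[Editor's note: as a rough illustration of the PatFrag approach being discussed, a fragment matching the unsigned clamp shape might look something like the sketch below. The names are illustrative only and are not taken from the patch; umin/umax are the generic ISD-level nodes from TargetSelectionDAG.td.]

    // Hypothetical sketch: match umin(umax(a, b), c) so instruction
    // selection patterns can map it onto an unsigned clamp instruction.
    def uclamp_pat : PatFrag<(ops node:$Zd, node:$Zn, node:$Zm),
                             (umin (umax node:$Zn, node:$Zm), node:$Zd)>;

A signed variant would use smin/smax in the same shape, and the fragment could then be reused in selection patterns for each element type.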

https://github.com/llvm/llvm-project/pull/75529

