[clang] [llvm] [AARCH64][SVE] Add intrinsics for SVE LUTI instructions (PR #97058)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Jul 1 01:22:43 PDT 2024
https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/97058
>From 4a6c4033f7deddcd4094ebde81402960de85bd80 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Fri, 28 Jun 2024 10:13:16 +0000
Subject: [PATCH] [AARCH64][SVE] Add intrinsics for SVE LUTI instructions
---
clang/include/clang/Basic/arm_sve.td | 21 +-
.../aarch64-sve2-intrinsics/acle_sve2_luti.c | 336 ++++++++++++++++++
.../acle_sve2_imm_lane.cpp | 32 ++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 20 ++
llvm/lib/Target/AArch64/SVEInstrFormats.td | 37 +-
.../CodeGen/AArch64/sve2-intrinsics-luti.ll | 107 ++++++
6 files changed, 551 insertions(+), 2 deletions(-)
create mode 100644 clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c
create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 94c093d891156..dc999a5bbb3d8 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1939,6 +1939,25 @@ def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [VerifyRunti
def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>;
}
+
+////////////////////////////////////////////////////////////////////////////////
+// SVE2 - Lookup table
+let SVETargetGuard = "sve2,lut", SMETargetGuard = "sme2,lut" in {
+ def SVLUTI2_B : SInst<"svluti2_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
+ def SVLUTI2_H : SInst<"svluti2_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>;
+
+ def SVLUTI4_B : SInst<"svluti4_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+ def SVLUTI4_H : SInst<"svluti4_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
+
+ def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
+}
+
+let SVETargetGuard = "sve2,lut,bf16", SMETargetGuard = "sme2,lut,bf16" in {
+ def SVLUTI2_BF16 : SInst<"svluti2_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti2_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>;
+ def SVLUTI4_BF16 : SInst<"svluti4_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti4_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
+ def SVLUTI4_BF16_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "b", MergeNone, "aarch64_sve_luti4_lane_x2", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
+}
+
////////////////////////////////////////////////////////////////////////////////
// SVE2 - Optional
@@ -2384,4 +2403,4 @@ let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in {
def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>;
def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>;
-}
+}
\ No newline at end of file
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c
new file mode 100644
index 0000000000000..d19246cba2d37
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c
@@ -0,0 +1,336 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sme -target-feature +sme2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+#include <arm_sve.h>
+
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// SME-CHECK-LABEL: @test_svluti2_lane_s8(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svluti2_lane_s8u10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svluti2_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_s8,)(table, indices, 0);
+}
+
+// SME-CHECK-LABEL: @test_svluti2_lane_u8(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svluti2_lane_u8u11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svluti2_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_u8,)(table, indices, 3);
+}
+
+// SME-CHECK-LABEL: @test_svluti2_lane_s16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_s16u11__SVInt16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svluti2_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_s16,)(table, indices, 0);
+}
+
+// SME-CHECK-LABEL: @test_svluti2_lane_u16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_u16u12__SVUint16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svluti2_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_u16,)(table, indices, 7);
+}
+
+// SME-CHECK-LABEL: @test_svluti2_lane_f16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5)
+// SME-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_f16u13__SVFloat16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5)
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svluti2_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_f16,)(table, indices, 5);
+}
+
+// SME-CHECK-LABEL: @test_svluti2_lane_bf16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// SME-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z22test_svluti2_lane_bf16u14__SVBfloat16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svluti2_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_bf16,)(table, indices, 2);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_s8(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svluti4_lane_s8u10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svluti4_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_s8,)(table, indices, 0);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_u8(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svluti4_lane_u8u11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svluti4_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_u8,)(table, indices, 1);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_s16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_s16u11__SVInt16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svluti4_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_s16,)(table, indices, 0);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_u16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_u16u12__SVUint16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svluti4_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_u16,)(table, indices, 3);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_f16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5)
+// SME-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_f16u13__SVFloat16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svluti4_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_f16,)(table, indices, 2);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_bf16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// SME-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z22test_svluti4_lane_bf16u14__SVBfloat16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svluti4_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_bf16,)(table, indices, 1);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_s16_x2(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+// CHECK-LABEL: @test_svluti4_lane_s16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_s16_x211svint16x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+svint16_t test_svluti4_lane_s16_x2(svint16x2_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_s16,_x2)(table, indices, 0);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_u16_x2(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+// CHECK-LABEL: @test_svluti4_lane_u16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_u16_x212svuint16x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+svuint16_t test_svluti4_lane_u16_x2(svuint16x2_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_u16,_x2)(table, indices, 3);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_f16_x2(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE:%.*]], i64 0)
+// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE]], i64 8)
+// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5)
+// SME-CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
+// CHECK-LABEL: @test_svluti4_lane_f16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
+//
+svfloat16_t test_svluti4_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_f16,_x2)(table, indices, 2);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_bf16_x2(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE:%.*]], i64 0)
+// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE]], i64 8)
+// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// SME-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+// CHECK-LABEL: @test_svluti4_lane_bf16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svluti4_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svluti4_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2)(table, indices, 1);
+}
diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp
index bca063385420a..e405077b3de93 100644
--- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp
+++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp
@@ -78,6 +78,14 @@ void test_range_0_7()
SVE_ACLE_FUNC(svqrdmlsh_lane,_s16,,)(svundef_s16(), svundef_s16(), svundef_s16(), -1);
// expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}}
SVE_ACLE_FUNC(svqrdmulh_lane,_s16,,)(svundef_s16(), svundef_s16(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}}
+ SVE_ACLE_FUNC(svluti2_lane,_s16,,)(svundef_s16(), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}}
+ SVE_ACLE_FUNC(svluti2_lane,_u16,,)(svundef_u16(), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}}
+ SVE_ACLE_FUNC(svluti2_lane,_f16,,)(svundef_f16(), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}}
+ SVE_ACLE_FUNC(svluti2_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1);
}
void test_range_0_3()
@@ -146,6 +154,26 @@ void test_range_0_3()
SVE_ACLE_FUNC(svqdmullb_lane,_s64,,)(svundef_s32(), svundef_s32(), 4);
// expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
SVE_ACLE_FUNC(svqdmullt_lane,_s64,,)(svundef_s32(), svundef_s32(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti2_lane,_s8,,)(svundef_s8(), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti2_lane,_u8,,)(svundef_u8(), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_s16,,)(svundef_s16(), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_u16,,)(svundef_u16(), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_f16,,)(svundef_f16(), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_s16,_x2,)(svcreate2_s16(svundef_s16(),svundef_s16()), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_u16,_x2,)(svcreate2_u16(svundef_u16(),svundef_u16()), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_f16,_x2,)(svcreate2_f16(svundef_f16(),svundef_f16()), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2,)(svcreate2_bf16(svundef_bf16(),svundef_bf16()), svundef_u8(), -1);
}
void test_range_0_1()
@@ -180,4 +208,8 @@ void test_range_0_1()
SVE_ACLE_FUNC(svqrdmlsh_lane,_s64,,)(svundef_s64(), svundef_s64(), svundef_s64(), 2);
// expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
SVE_ACLE_FUNC(svqrdmulh_lane,_s64,,)(svundef_s64(), svundef_s64(), 2);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+ SVE_ACLE_FUNC(svluti4_lane,_s8,,)(svundef_s8(), svundef_u8(), -1);
+ // expected-error-re at +1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+ SVE_ACLE_FUNC(svluti4_lane,_u8,,)(svundef_u8(), svundef_u8(), -1);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 38d71b17b476d..a443d6858b2a0 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1268,6 +1268,13 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
LLVMVectorOfBitcastsToInt<0>],
[IntrNoMem]>;
+ class SVE2_LUTI_Inrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ llvm_nxv16i8_ty,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
class SVE2_1VectorArg_Long_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMSubdivide2VectorType<0>,
@@ -2662,6 +2669,19 @@ def int_aarch64_sve_sm4ekey : ClangBuiltin<"__builtin_sve_svsm4ekey_u32">,
def int_aarch64_sve_tbl2 : AdvSIMD_SVE2_TBX_Intrinsic;
def int_aarch64_sve_tbx : AdvSIMD_SVE2_TBX_Intrinsic;
+//
+// SVE2 - Lookup Table
+//
+
+def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic;
+def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic;
+def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ llvm_nxv16i8_ty,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+
//
// SVE2 - Optional bit permutation
//
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index fc7d3cdda4acd..c87d0746bc6a1 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10349,6 +10349,16 @@ multiclass sve2_luti2_vector_index<string mnemonic> {
let Inst{23-22} = idx{2-1};
let Inst{12} = idx{0};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, int_aarch64_sve_luti2_lane, nxv16i8, nxv16i8,
+ i32, timm32_0_3, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, int_aarch64_sve_luti2_lane, nxv8i16, nxv16i8,
+ i32, timm32_0_7, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv8f16, int_aarch64_sve_luti2_lane, nxv8f16, nxv16i8,
+ i32, timm32_0_7, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti2_lane, nxv8bf16, nxv16i8,
+ i32, timm32_0_7, !cast<Instruction>(NAME # _H)>;
+
}
// FP8 Look up table read with 4-bit indices
@@ -10361,14 +10371,39 @@ multiclass sve2_luti4_vector_index<string mnemonic> {
bits<2> idx;
let Inst{23-22} = idx;
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, int_aarch64_sve_luti4_lane, nxv16i8, nxv16i8,
+ i32, timm32_0_1, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, int_aarch64_sve_luti4_lane, nxv8i16, nxv16i8,
+ i32, timm32_0_3, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv8f16, int_aarch64_sve_luti4_lane, nxv8f16, nxv16i8,
+ i32, timm32_0_3, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti4_lane, nxv8bf16, nxv16i8,
+ i32, timm32_0_3, !cast<Instruction>(NAME # _H)>;
}
// FP8 Look up table read with 4-bit indices (two contiguous registers)
multiclass sve2_luti4_vector_vg2_index<string mnemonic> {
- def _H : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexS32b, {?, 0b101}, mnemonic> {
+ def NAME : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexS32b, {?, 0b101}, mnemonic> {
bits<2> idx;
let Inst{23-22} = idx;
}
+
+ def : Pat<(nxv8i16 (int_aarch64_sve_luti4_lane_x2 nxv8i16:$Op1, nxv8i16:$Op2,
+ nxv16i8:$Op3, (i32 timm32_0_3:$Op4))),
+ (nxv8i16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
+ nxv8i16:$Op2, zsub1),
+ nxv16i8:$Op3, timm32_0_3:$Op4))>;
+ def : Pat<(nxv8f16 (int_aarch64_sve_luti4_lane_x2 nxv8f16:$Op1, nxv8f16:$Op2,
+ nxv16i8:$Op3, (i32 timm32_0_3:$Op4))),
+ (nxv8f16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
+ nxv8f16:$Op2, zsub1),
+ nxv16i8:$Op3, timm32_0_3:$Op4))>;
+ def : Pat<(nxv8bf16 (int_aarch64_sve_luti4_lane_x2 nxv8bf16:$Op1, nxv8bf16:$Op2,
+ nxv16i8:$Op3, (i32 timm32_0_3:$Op4))),
+ (nxv8bf16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0,
+ nxv8bf16:$Op2, zsub1),
+ nxv16i8:$Op3, timm32_0_3:$Op4))>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll
new file mode 100644
index 0000000000000..5cea7536e1f3c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+sve2,+lut,+bf16 | FileCheck %s
+
+define <vscale x 16 x i8> @test_luti2_lane_i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti2_lane_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 z0.b, { z0.b }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @test_luti2_lane_i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti2_lane_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @test_luti2_lane_f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti2_lane_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.lane.nxv8f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x bfloat> @test_luti2_lane_bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti2_lane_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.lane.nxv8bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 16 x i8> @test_luti4_lane_i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti4 z0.b, { z0.b }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @test_luti4_lane_i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @test_luti4_lane_f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.nxv8f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x bfloat> @test_luti4_lane_bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.nxv8bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x i16> @test_luti4_lane_i16_x2(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_i16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> %table, <vscale x 8 x i16> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @test_luti4_lane_f16_x2(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_f16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.x2.nxv8f16(<vscale x 8 x half> %table, <vscale x 8 x half> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x bfloat> @test_luti4_lane_bf16_x2(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_bf16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16(<vscale x 8 x bfloat> %table, <vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x bfloat> %res
+}
More information about the cfe-commits
mailing list