[clang] [llvm] [AARCH64][SVE] Add intrinsics for SVE LUTI instructions (PR #97058)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Jul 4 06:04:09 PDT 2024
https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/97058
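This patch adds the ACLE intrinsics svluti2_lane, svluti4_lane and svluti4_lane_x2 for the SVE2 LUTI2/LUTI4 lookup-table instructions (guarded by sve2+lut, or sme2+lut in streaming mode), together with the matching LLVM IR intrinsics and instruction-selection patterns. As a quick illustration (not part of the patch), the sketch below mirrors two of the calls exercised in the new codegen test; the function names are made up, and it assumes the file is built with the sve2 and lut target features enabled (e.g. -target-feature +sve2 -target-feature +lut, as in the RUN lines):

  #include <arm_sve.h>

  // 2-bit-index table lookup on byte elements; the lane argument must be a
  // compile-time constant in [0, 3] (ImmCheck0_3 in arm_sve.td).
  svuint8_t do_luti2_b(svuint8_t table, svuint8_t indices) {
    return svluti2_lane_u8(table, indices, 0);
  }

  // 4-bit-index lookup on halfword elements from a two-vector table; the lane
  // argument must be a compile-time constant in [0, 3].
  svuint16_t do_luti4_h_x2(svuint16x2_t table, svuint8_t indices) {
    return svluti4_lane_u16_x2(table, indices, 3);
  }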
>From 4a6c4033f7deddcd4094ebde81402960de85bd80 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Fri, 28 Jun 2024 10:13:16 +0000
Subject: [PATCH 1/2] [AARCH64][SVE] Add intrinsics for SVE LUTI instructions
---
clang/include/clang/Basic/arm_sve.td | 21 +-
.../aarch64-sve2-intrinsics/acle_sve2_luti.c | 336 ++++++++++++++++++
.../acle_sve2_imm_lane.cpp | 32 ++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 20 ++
llvm/lib/Target/AArch64/SVEInstrFormats.td | 37 +-
.../CodeGen/AArch64/sve2-intrinsics-luti.ll | 107 ++++++
6 files changed, 551 insertions(+), 2 deletions(-)
create mode 100644 clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c
create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 94c093d891156..dc999a5bbb3d8 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1939,6 +1939,25 @@ def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [VerifyRunti
def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>;
}
+
+////////////////////////////////////////////////////////////////////////////////
+// SVE2 - Lookup table
+let SVETargetGuard = "sve2,lut", SMETargetGuard = "sme2,lut" in {
+ def SVLUTI2_B : SInst<"svluti2_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
+ def SVLUTI2_H : SInst<"svluti2_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>;
+
+ def SVLUTI4_B : SInst<"svluti4_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+ def SVLUTI4_H : SInst<"svluti4_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
+
+ def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
+}
+
+let SVETargetGuard = "sve2,lut,bf16", SMETargetGuard = "sme2,lut,bf16" in {
+ def SVLUTI2_BF16 : SInst<"svluti2_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>;
+ def SVLUTI4_BF16 : SInst<"svluti4_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
+ def SVLUTI4_BF16_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "b", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
+}
+
////////////////////////////////////////////////////////////////////////////////
// SVE2 - Optional
@@ -2384,4 +2403,4 @@ let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in {
def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>;
def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>;
-}
+}
\ No newline at end of file
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c
new file mode 100644
index 0000000000000..d19246cba2d37
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c
@@ -0,0 +1,336 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sme -target-feature +sme2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+#include <arm_sve.h>
+
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// SME-CHECK-LABEL: @test_svluti2_lane_s8(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svluti2_lane_s8u10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svluti2_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_s8,)(table, indices, 0);
+}
+
+// SME-CHECK-LABEL: @test_svluti2_lane_u8(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svluti2_lane_u8u11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svluti2_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_u8,)(table, indices, 3);
+}
+
+// SME-CHECK-LABEL: @test_svluti2_lane_s16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_s16u11__SVInt16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svluti2_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_s16,)(table, indices, 0);
+}
+
+// SME-CHECK-LABEL: @test_svluti2_lane_u16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_u16u12__SVUint16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svluti2_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_u16,)(table, indices, 7);
+}
+
+// SME-CHECK-LABEL: @test_svluti2_lane_f16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5)
+// SME-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_f16u13__SVFloat16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5)
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svluti2_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_f16,)(table, indices, 5);
+}
+
+// SME-CHECK-LABEL: @test_svluti2_lane_bf16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// SME-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+// CHECK-LABEL: @test_svluti2_lane_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z22test_svluti2_lane_bf16u14__SVBfloat16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svluti2_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti2_lane,_bf16,)(table, indices, 2);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_s8(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svluti4_lane_s8u10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svluti4_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_s8,)(table, indices, 0);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_u8(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svluti4_lane_u8u11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svluti4_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_u8,)(table, indices, 1);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_s16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_s16u11__SVInt16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svluti4_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_s16,)(table, indices, 0);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_u16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_u16u12__SVUint16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svluti4_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_u16,)(table, indices, 3);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_f16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5)
+// SME-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_f16u13__SVFloat16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svluti4_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_f16,)(table, indices, 2);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_bf16(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// SME-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+// CHECK-LABEL: @test_svluti4_lane_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z22test_svluti4_lane_bf16u14__SVBfloat16_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svluti4_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_bf16,)(table, indices, 1);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_s16_x2(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+// CHECK-LABEL: @test_svluti4_lane_s16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_s16_x211svint16x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+svint16_t test_svluti4_lane_s16_x2(svint16x2_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_s16,_x2)(table, indices, 0);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_u16_x2(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7)
+// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+// CHECK-LABEL: @test_svluti4_lane_u16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_u16_x212svuint16x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
+//
+svuint16_t test_svluti4_lane_u16_x2(svuint16x2_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_u16,_x2)(table, indices, 3);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_f16_x2(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE:%.*]], i64 0)
+// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE]], i64 8)
+// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5)
+// SME-CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
+// CHECK-LABEL: @test_svluti4_lane_f16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
+//
+svfloat16_t test_svluti4_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_f16,_x2)(table, indices, 2);
+}
+
+// SME-CHECK-LABEL: @test_svluti4_lane_bf16_x2(
+// SME-CHECK-NEXT: entry:
+// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE:%.*]], i64 0)
+// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE]], i64 8)
+// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2)
+// SME-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+// CHECK-LABEL: @test_svluti4_lane_bf16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+// CPP-CHECK-LABEL: @_Z25test_svluti4_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]]
+//
+svbfloat16_t test_svluti4_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) MODE_ATTR{
+ return SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2)(table, indices, 1);
+}
diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp
index bca063385420a..e405077b3de93 100644
--- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp
+++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp
@@ -78,6 +78,14 @@ void test_range_0_7()
SVE_ACLE_FUNC(svqrdmlsh_lane,_s16,,)(svundef_s16(), svundef_s16(), svundef_s16(), -1);
// expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}}
SVE_ACLE_FUNC(svqrdmulh_lane,_s16,,)(svundef_s16(), svundef_s16(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}}
+ SVE_ACLE_FUNC(svluti2_lane,_s16,,)(svundef_s16(), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}}
+ SVE_ACLE_FUNC(svluti2_lane,_u16,,)(svundef_u16(), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}}
+ SVE_ACLE_FUNC(svluti2_lane,_f16,,)(svundef_f16(), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}}
+ SVE_ACLE_FUNC(svluti2_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1);
}
void test_range_0_3()
@@ -146,6 +154,26 @@ void test_range_0_3()
SVE_ACLE_FUNC(svqdmullb_lane,_s64,,)(svundef_s32(), svundef_s32(), 4);
// expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
SVE_ACLE_FUNC(svqdmullt_lane,_s64,,)(svundef_s32(), svundef_s32(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti2_lane,_s8,,)(svundef_s8(), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti2_lane,_u8,,)(svundef_u8(), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_s16,,)(svundef_s16(), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_u16,,)(svundef_u16(), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_f16,,)(svundef_f16(), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_s16,_x2,)(svcreate2_s16(svundef_s16(),svundef_s16()), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_u16,_x2,)(svcreate2_u16(svundef_u16(),svundef_u16()), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_f16,_x2,)(svcreate2_f16(svundef_f16(),svundef_f16()), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}}
+ SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2,)(svcreate2_bf16(svundef_bf16(),svundef_bf16()), svundef_u8(), -1);
}
void test_range_0_1()
@@ -180,4 +208,8 @@ void test_range_0_1()
SVE_ACLE_FUNC(svqrdmlsh_lane,_s64,,)(svundef_s64(), svundef_s64(), svundef_s64(), 2);
// expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
SVE_ACLE_FUNC(svqrdmulh_lane,_s64,,)(svundef_s64(), svundef_s64(), 2);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+ SVE_ACLE_FUNC(svluti4_lane,_s8,,)(svundef_s8(), svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+ SVE_ACLE_FUNC(svluti4_lane,_u8,,)(svundef_u8(), svundef_u8(), -1);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 38d71b17b476d..a443d6858b2a0 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1268,6 +1268,13 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
LLVMVectorOfBitcastsToInt<0>],
[IntrNoMem]>;
+ class SVE2_LUTI_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ llvm_nxv16i8_ty,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
class SVE2_1VectorArg_Long_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMSubdivide2VectorType<0>,
@@ -2662,6 +2669,19 @@ def int_aarch64_sve_sm4ekey : ClangBuiltin<"__builtin_sve_svsm4ekey_u32">,
def int_aarch64_sve_tbl2 : AdvSIMD_SVE2_TBX_Intrinsic;
def int_aarch64_sve_tbx : AdvSIMD_SVE2_TBX_Intrinsic;
+//
+// SVE2 - Lookup Table
+//
+
+def int_aarch64_sve_luti2_lane : SVE2_LUTI_Intrinsic;
+def int_aarch64_sve_luti4_lane : SVE2_LUTI_Intrinsic;
+def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ llvm_nxv16i8_ty,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+
//
// SVE2 - Optional bit permutation
//
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index fc7d3cdda4acd..c87d0746bc6a1 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10349,6 +10349,16 @@ multiclass sve2_luti2_vector_index<string mnemonic> {
let Inst{23-22} = idx{2-1};
let Inst{12} = idx{0};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, int_aarch64_sve_luti2_lane, nxv16i8, nxv16i8,
+ i32, timm32_0_3, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, int_aarch64_sve_luti2_lane, nxv8i16, nxv16i8,
+ i32, timm32_0_7, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv8f16, int_aarch64_sve_luti2_lane, nxv8f16, nxv16i8,
+ i32, timm32_0_7, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti2_lane, nxv8bf16, nxv16i8,
+ i32, timm32_0_7, !cast<Instruction>(NAME # _H)>;
+
}
// FP8 Look up table read with 4-bit indices
@@ -10361,14 +10371,39 @@ multiclass sve2_luti4_vector_index<string mnemonic> {
bits<2> idx;
let Inst{23-22} = idx;
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, int_aarch64_sve_luti4_lane, nxv16i8, nxv16i8,
+ i32, timm32_0_1, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, int_aarch64_sve_luti4_lane, nxv8i16, nxv16i8,
+ i32, timm32_0_3, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv8f16, int_aarch64_sve_luti4_lane, nxv8f16, nxv16i8,
+ i32, timm32_0_3, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti4_lane, nxv8bf16, nxv16i8,
+ i32, timm32_0_3, !cast<Instruction>(NAME # _H)>;
}
// FP8 Look up table read with 4-bit indices (two contiguous registers)
multiclass sve2_luti4_vector_vg2_index<string mnemonic> {
- def _H : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexS32b, {?, 0b101}, mnemonic> {
+ def NAME : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexS32b, {?, 0b101}, mnemonic> {
bits<2> idx;
let Inst{23-22} = idx;
}
+
+ def : Pat<(nxv8i16 (int_aarch64_sve_luti4_lane_x2 nxv8i16:$Op1, nxv8i16:$Op2,
+ nxv16i8:$Op3, (i32 timm32_0_3:$Op4))),
+ (nxv8i16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
+ nxv8i16:$Op2, zsub1),
+ nxv16i8:$Op3, timm32_0_3:$Op4))>;
+ def : Pat<(nxv8f16 (int_aarch64_sve_luti4_lane_x2 nxv8f16:$Op1, nxv8f16:$Op2,
+ nxv16i8:$Op3, (i32 timm32_0_3:$Op4))),
+ (nxv8f16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
+ nxv8f16:$Op2, zsub1),
+ nxv16i8:$Op3, timm32_0_3:$Op4))>;
+ def : Pat<(nxv8bf16 (int_aarch64_sve_luti4_lane_x2 nxv8bf16:$Op1, nxv8bf16:$Op2,
+ nxv16i8:$Op3, (i32 timm32_0_3:$Op4))),
+ (nxv8bf16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0,
+ nxv8bf16:$Op2, zsub1),
+ nxv16i8:$Op3, timm32_0_3:$Op4))>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll
new file mode 100644
index 0000000000000..5cea7536e1f3c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+sve2,+lut,+bf16 | FileCheck %s
+
+define <vscale x 16 x i8> @test_luti2_lane_i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti2_lane_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 z0.b, { z0.b }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @test_luti2_lane_i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti2_lane_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @test_luti2_lane_f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti2_lane_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.lane.nxv8f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x bfloat> @test_luti2_lane_bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti2_lane_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.lane.nxv8bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 16 x i8> @test_luti4_lane_i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti4 z0.b, { z0.b }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @test_luti4_lane_i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @test_luti4_lane_f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.nxv8f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x bfloat> @test_luti4_lane_bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.nxv8bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x i16> @test_luti4_lane_i16_x2(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_i16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> %table, <vscale x 8 x i16> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @test_luti4_lane_f16_x2(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_f16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.x2.nxv8f16(<vscale x 8 x half> %table, <vscale x 8 x half> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x bfloat> @test_luti4_lane_bf16_x2(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices){
+; CHECK-LABEL: test_luti4_lane_bf16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0]
+; CHECK-NEXT: ret
+ %res= tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16(<vscale x 8 x bfloat> %table, <vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices, i32 0)
+ ret <vscale x 8 x bfloat> %res
+}
>From 160a73b17e409aff972d98e3ddeab438828e88ae Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Thu, 4 Jul 2024 13:00:00 +0000
Subject: [PATCH 2/2] Fixes
---
clang/include/clang/Basic/arm_sve.td | 1 -
clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c | 1 +
llvm/lib/Target/AArch64/SVEInstrFormats.td | 1 -
3 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index dc999a5bbb3d8..d152b53c73860 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1939,7 +1939,6 @@ def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [VerifyRunti
def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>;
}
-
////////////////////////////////////////////////////////////////////////////////
// SVE2 - Lookup table
let SVETargetGuard = "sve2,lut", SMETargetGuard = "sme2,lut" in {
diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c
index d19246cba2d37..60c4828c407e8 100644
--- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c
+++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c
@@ -8,6 +8,7 @@
// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \
// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +lut -target-feature +bf16 -O1 -Werror -Wall -o /dev/null %s
#include <arm_sve.h>
#if defined __ARM_FEATURE_SME
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index c87d0746bc6a1..71cab1d41b31d 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10358,7 +10358,6 @@ multiclass sve2_luti2_vector_index<string mnemonic> {
i32, timm32_0_7, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti2_lane, nxv8bf16, nxv16i8,
i32, timm32_0_7, !cast<Instruction>(NAME # _H)>;
-
}
// FP8 Look up table read with 4-bit indices