[clang] [llvm] [AArch64][clang][llvm] Add ACLE Armv9.7 lookup table intrinsics (PR #187046)

Wed Jun 10 03:59:19 PDT 2026

https://github.com/jthackray updated https://github.com/llvm/llvm-project/pull/187046

>From 7c394fb3e627fcb5e98abbcebde1739a57d2d129 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Fri, 13 Mar 2026 15:35:37 +0000
Subject: [PATCH 01/22] [AArch64][clang][llvm] Add support for Armv9.7-A lookup
 table intrinsics

Add support for the following Armv9.7-A Lookup Table (lut)
instruction intrinsics:

SVE2.3
```c
  // Variants are also available for: _u8 and _mf8.
  svint8_t svluti6[_s8](svint8x2_t table, svuint8_t indices);
```

SVE2.3 and SME2.3
```c
  // Variants are also available for: _u16_x2 and _f16_x2.
  svint16_t svluti6_lane[_s16_x2](svint16x2_t table, svuint8_t indices, uint64_t imm_idx);
```

SME2.3
```c
  // Variants are also available for: _u16, _f16 and _bf16.
  svint16x4_t svluti6_lane[_s16_x4](svint16x2_t table, svuint8x2_t indices, uint64_t imm_idx);

  // Variants are also available for: _u8 and _mf8.
  svint8x4_t svluti6_zt_s8_x4(uint64_t zt0, svuint8x3_t zn) __arm_streaming __arm_in("zt0");

  // Variants are also available for: _u8 and _mf8.
  svint8_t svluti6_zt_s8(uint64_t zt0, svuint8_t zn) __arm_streaming __arm_in("zt0");
```
---
 clang/include/clang/Basic/arm_sme.td          |   6 +
 clang/include/clang/Basic/arm_sve.td          |   9 +
 clang/lib/Basic/Targets/AArch64.cpp           |  29 +++
 clang/lib/Basic/Targets/AArch64.h             |   2 +
 .../sme2p3-intrinsics/acle_sme2p3_luti6.c     | 175 ++++++++++++++++++
 .../sve2p3-intrinsics/acle_sve2p3_luti6.c     | 112 +++++++++++
 .../Preprocessor/aarch64-target-features.c    |  23 +++
 .../acle_sme2p3_imm.c                         |  21 +++
 .../acle_sme2p3_target.c                      |  20 ++
 .../acle_sme2p3_target_lane.c                 |  16 ++
 .../acle_sve2p3_imm.cpp                       |  24 ++-
 .../acle_sve2p3_target.c                      |  19 ++
 .../acle_sve2p3_target_lane.c                 |  14 ++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  31 ++++
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 102 ++++++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  10 +-
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |   3 +
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  12 +-
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |  11 ++
 .../AArch64/sme2p3-intrinsics-luti6.ll        | 105 +++++++++++
 .../AArch64/sve2p3-intrinsics-luti6.ll        |  55 ++++++
 .../test/Verifier/AArch64/luti6-intrinsics.ll |  79 ++++++++
 22 files changed, 872 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
 create mode 100644 clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
 create mode 100644 clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c
 create mode 100644 clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
 create mode 100644 clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
 create mode 100644 clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
 create mode 100644 clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
 create mode 100644 llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
 create mode 100644 llvm/test/Verifier/AArch64/luti6-intrinsics.ll

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 032c588966032..8de360fca5f5e 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -981,6 +981,12 @@ let SMETargetGuard = "sme-lutv2" in {
   def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
 }
 
+let SMETargetGuard = "sme2p3" in {
+  def SVLUTI6_ZT      : SInst<"svluti6_zt_{d}", "diu", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
+  def SVLUTI6_ZT_X4   : SInst<"svluti6_zt_{d}_x4", "4i3.u", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
+  def SVLUTI6_LANE_X4 : SInst<"svluti6_lane[_{d}_x4]", "42.d2.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
+}
+
 let SMETargetGuard = "sme-f8f32" in {
   def SVMOPA_FP8_ZA32 : Inst<"svmopa_za32[_mf8]_m", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za32",
                              [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>]>;
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 25f42cbcac64e..6d2bd0d30455f 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1919,6 +1919,15 @@ let SVETargetGuard = "(sve2|sme2),lut", SMETargetGuard = "sme2,lut" in {
   def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
 }
 
+let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
+  def SVLUTI6 : SInst<"svluti6[_{d}]", "d2u", "cUcm", MergeNone, "aarch64_sve_luti6", [IsOverloadNone]>;
+}
+
+let SVETargetGuard = "sve2p3", SMETargetGuard = "sme2p3" in {
+  def SVLUTI6_x2_I16 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUs", MergeNone, "aarch64_sve_luti6_lane_x2_i16", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+  def SVLUTI6_x2_F16 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "h", MergeNone, "aarch64_sve_luti6_lane_x2_f16", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Optional
 
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 9afe6cb10729d..ed07e1415b7f5 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -500,6 +500,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasSVE2p1)
     Builder.defineMacro("__ARM_FEATURE_SVE2p1", "1");
 
+  if (HasSVE2p3)
+    Builder.defineMacro("__ARM_FEATURE_SVE2p3", "1");
+
   if (HasSVE2 && HasSVEAES)
     Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1");
 
@@ -526,6 +529,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasSME2p1)
     Builder.defineMacro("__ARM_FEATURE_SME2p1", "1");
 
+  if (HasSME2p3)
+    Builder.defineMacro("__ARM_FEATURE_SME2p3", "1");
+
   if (HasSMEF16F16)
     Builder.defineMacro("__ARM_FEATURE_SME_F16F16", "1");
 
@@ -907,9 +913,11 @@ void AArch64TargetInfo::computeFeatureLookup() {
       .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3)
       .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4)
       .Case("sve2p1", FPU & SveMode && HasSVE2p1)
+      .Case("sve2p3", FPU & SveMode && HasSVE2p3)
       .Case("sme", HasSME)
       .Case("sme2", HasSME2)
       .Case("sme2p1", HasSME2p1)
+      .Case("sme2p3", HasSME2p3)
       .Case("sme-f64f64", HasSMEF64F64)
       .Case("sme-i16i64", HasSMEI16I64)
       .Case("sme-fa64", HasSMEFA64)
@@ -1015,6 +1023,15 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasSVE2 = true;
       HasSVE2p1 = true;
     }
+    if (Feature == "+sve2p3") {
+      FPU |= NeonMode;
+      FPU |= SveMode;
+      HasFullFP16 = true;
+      HasSVE2 = true;
+      HasSVE2p1 = true;
+      HasSVE2p2 = true;
+      HasSVE2p3 = true;
+    }
     if (Feature == "+sve-aes") {
       FPU |= NeonMode;
       HasFullFP16 = true;
@@ -1071,6 +1088,18 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasBFloat16 = true;
       HasFullFP16 = true;
     }
+    if (Feature == "+sme2p3") {
+      HasSME = true;
+      HasSME2 = true;
+      HasSVE2 = true;
+      HasSVE2p1 = true;
+      HasSVE2p2 = true;
+      HasSME2p1 = true;
+      HasSME2p2 = true;
+      HasSME2p3 = true;
+      HasBFloat16 = true;
+      HasFullFP16 = true;
+    }
     if (Feature == "+sme-f64f64") {
       HasSME = true;
       HasSMEF64F64 = true;
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index 0a29bad81939b..b3c722a7f6d74 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -86,6 +86,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   bool HasBFloat16 = false;
   bool HasSVE2 = false;
   bool HasSVE2p1 = false;
+  bool HasSVE2p3 = false;
   bool HasSVEAES = false;
   bool HasSVE2SHA3 = false;
   bool HasSVE2SM4 = false;
@@ -111,6 +112,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   bool HasSMEF16F16 = false;
   bool HasSMEB16B16 = false;
   bool HasSME2p1 = false;
+  bool HasSME2p3 = false;
   bool HasFP8 = false;
   bool HasFP8FMA = false;
   bool HasFP8DOT2 = false;
diff --git a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
new file mode 100644
index 0000000000000..ae5fb1f64d0fc
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
@@ -0,0 +1,175 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_s16_x4(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z24test_svluti6_lane_s16_x411svint16x2_t11svuint8x2_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
+svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
+    __arm_streaming {
+  return svluti6_lane_s16_x4(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z24test_svluti6_lane_u16_x412svuint16x2_t11svuint8x2_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
+svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
+    __arm_streaming {
+  return svluti6_lane_u16_x4(table, indices, 0);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4(
+// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @_Z24test_svluti6_lane_f16_x413svfloat16x2_t11svuint8x2_t(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
+//
+svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
+    __arm_streaming {
+  return svluti6_lane_f16_x4(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4(
+// CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z25test_svluti6_lane_bf16_x414svbfloat16x2_t11svuint8x2_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indices)
+    __arm_streaming {
+  return svluti6_lane_bf16_x4(table, indices, 0);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_s8(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z18test_svluti6_zt_s8u11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svluti6_zt_s8(svuint8_t indices) __arm_streaming __arm_in("zt0") {
+  return svluti6_zt_s8(0, indices);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_u8(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z18test_svluti6_zt_u8u11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svluti6_zt_u8(svuint8_t indices) __arm_streaming __arm_in("zt0") {
+  return svluti6_zt_u8(0, indices);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z19test_svluti6_zt_mf8u11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svluti6_zt_mf8(svuint8_t indices) __arm_streaming __arm_in("zt0") {
+  return svluti6_zt_mf8(0, indices);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svluti6_zt_u8_x4(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z21test_svluti6_zt_u8_x411svuint8x3_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svluti6_zt_u8_x4(svuint8x3_t indices)
+    __arm_streaming __arm_in("zt0") {
+  return svluti6_zt_u8_x4(0, indices);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svluti6_zt_s8_x4(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z21test_svluti6_zt_s8_x411svuint8x3_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svint8x4_t test_svluti6_zt_s8_x4(svuint8x3_t indices)
+    __arm_streaming __arm_in("zt0") {
+  return svluti6_zt_s8_x4(0, indices);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svluti6_zt_mf8_x4(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z22test_svluti6_zt_mf8_x411svuint8x3_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CPP-CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svmfloat8x4_t test_svluti6_zt_mf8_x4(svuint8x3_t indices)
+    __arm_streaming __arm_in("zt0") {
+  return svluti6_zt_mf8_x4(0, indices);
+}
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
new file mode 100644
index 0000000000000..a806ef0b13c20
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
@@ -0,0 +1,112 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1, A2) A1##A2
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8(
+// CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) {
+  return SVE_ACLE_FUNC(svluti6, _s8)(table, indices);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_u8(
+// CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) {
+  return SVE_ACLE_FUNC(svluti6, _u8)(table, indices);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) {
+  return SVE_ACLE_FUNC(svluti6, _mf8)(table, indices);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) {
+  return SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) {
+  return SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(table, indices, 0);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
+// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) {
+  return SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(table, indices, 1);
+}
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 60ddaad639d48..6316b25befed8 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -827,9 +827,32 @@
 // CHECK-SVE2p2: __ARM_NEON_FP 0xE
 // CHECK-SVE2p2: __ARM_NEON_SVE_BRIDGE 1
 //
+// RUN: %clang -target aarch64-none-linux-gnu -march=armv9.7-a+sve2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2p3 %s
+// CHECK-SVE2p3: __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
+// CHECK-SVE2p3: __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
+// CHECK-SVE2p3-NOT: __ARM_FEATURE_SME2p3 1
+// CHECK-SVE2p3: __ARM_FEATURE_SVE 1
+// CHECK-SVE2p3: __ARM_FEATURE_SVE2 1
+// CHECK-SVE2p3: __ARM_FEATURE_SVE2p1 1
+// CHECK-SVE2p3: __ARM_FEATURE_SVE2p2 1
+// CHECK-SVE2p3: __ARM_FEATURE_SVE2p3 1
+// CHECK-SVE2p3: __ARM_NEON 1
+// CHECK-SVE2p3: __ARM_NEON_FP 0xE
+// CHECK-SVE2p3: __ARM_NEON_SVE_BRIDGE 1
+//
 // RUN: %clang --target=aarch64 -march=armv9-a+sme2p2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p2 %s
 // CHECK-SME2p2: __ARM_FEATURE_LOCALLY_STREAMING 1
 // CHECK-SME2p2: __ARM_FEATURE_SME 1
 // CHECK-SME2p2: __ARM_FEATURE_SME2 1
 // CHECK-SME2p2: __ARM_FEATURE_SME2p1 1
 // CHECK-SME2p2: __ARM_FEATURE_SME2p2 1
+//
+// RUN: %clang --target=aarch64 -march=armv9.7-a+sme2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p3 %s
+// CHECK-SME2p3: __ARM_FEATURE_LOCALLY_STREAMING 1
+// CHECK-SME2p3: __ARM_FEATURE_SME 1
+// CHECK-SME2p3: __ARM_FEATURE_SME2 1
+// CHECK-SME2p3: __ARM_FEATURE_SME2p1 1
+// CHECK-SME2p3: __ARM_FEATURE_SME2p2 1
+// CHECK-SME2p3: __ARM_FEATURE_SME2p3 1
+// CHECK-SME2p3: __ARM_FEATURE_SVE2p1 1
+// CHECK-SME2p3: __ARM_FEATURE_SVE2p2 1
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c
new file mode 100644
index 0000000000000..8883ea3580fb2
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c
@@ -0,0 +1,21 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -fsyntax-only -verify %s
+
+#include <arm_sme.h>
+
+void test_range_0_0(void) __arm_streaming __arm_in("zt0") {
+  svluti6_zt_s8(1, svundef_u8()); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+  svluti6_zt_u8_x4(1, svcreate3_u8(svundef_u8(), svundef_u8(), svundef_u8())); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+}
+
+void test_range_0_1(void) __arm_streaming {
+  svluti6_lane_s16_x4(svcreate2_s16(svundef_s16(), svundef_s16()), // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+                      svcreate2_u8(svundef_u8(), svundef_u8()), -1);
+  svluti6_lane_u16_x4(svcreate2_u16(svundef_u16(), svundef_u16()), // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+                      svcreate2_u8(svundef_u8(), svundef_u8()), 2);
+  svluti6_lane_f16_x4(svcreate2_f16(svundef_f16(), svundef_f16()), // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+                      svcreate2_u8(svundef_u8(), svundef_u8()), -1);
+  svluti6_lane_bf16_x4(svcreate2_bf16(svundef_bf16(), svundef_bf16()), // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+                       svcreate2_u8(svundef_u8(), svundef_u8()), 2);
+}
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
new file mode 100644
index 0000000000000..2cffc1344bfe1
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
@@ -0,0 +1,20 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +bf16 -verify -emit-llvm -o - %s
+
+#include <arm_sme.h>
+
+svint8_t missing_sme2p3_zt(svuint8_t indices) __arm_streaming __arm_in("zt0") {
+  return svluti6_zt_s8(0, indices); // expected-error {{'svluti6_zt_s8' needs target feature sme,sme2p3}}
+}
+
+__attribute__((target("sme2p3")))
+svint8_t has_sme2p3_zt(svuint8_t indices) __arm_streaming __arm_in("zt0") {
+  return svluti6_zt_s8(0, indices);
+}
+
+__attribute__((target("sme2p3")))
+svfloat16_t has_sme2p3_implied_sme2p2(svbool_t pg, svfloat16_t op)
+    __arm_streaming {
+  return svcompact_f16(pg, op);
+}
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
new file mode 100644
index 0000000000000..1a06663a9aab7
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
@@ -0,0 +1,16 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +bf16 -verify -emit-llvm -o - %s
+
+#include <arm_sme.h>
+
+svbfloat16x4_t missing_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
+    __arm_streaming {
+  return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature sme,sme2p3}}
+}
+
+__attribute__((target("sme2p3,bf16")))
+svbfloat16x4_t has_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
+    __arm_streaming {
+  return svluti6_lane_bf16_x4(table, indices, 0);
+}
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
index 60183e346f181..c284276f60e77 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
@@ -1,10 +1,15 @@
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +sve2p3 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -fsyntax-only -verify %s
 
 #include <arm_sve.h>
 
-
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1, A2) A1##A2
+#endif
 
 svint8_t test_svqshrn_n_s8_s16_x2(svint16x2_t zn, uint64_t imm)
 {
@@ -83,7 +88,20 @@ svuint8_t test_svqrshrun_n_u8_s16_x2(svint16x2_t zn, uint64_t imm)
   svqrshrun_n_u8_s16_x2(zn, 9); // expected-error-re {{argument value {{[0-9]+}} is outside the valid range [1, 8]}}
   svqrshrun_n_u8_s16_x2(zn, -1); // expected-error-re {{argument value {{[0-9]+}} is outside the valid range [1, 8]}}
 
-  svqrshrun_n_u8_s16_x2(zn, imm); // expected-error-re {{argument to {{.+}} must be a constant integer}}}}
+  svqrshrun_n_u8_s16_x2(zn, imm); // expected-error-re {{argument to {{.+}} must be a constant integer}}
+}
+
+
+void test_range_0_1() {
+  // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+  SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(svcreate2_s16(svundef_s16(), svundef_s16()),
+                                        svundef_u8(), -1);
+  // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+  SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(svcreate2_u16(svundef_u16(), svundef_u16()),
+                                        svundef_u8(), 2);
+  // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+  SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(svcreate2_f16(svundef_f16(), svundef_f16()),
+                                        svundef_u8(), -1);
 }
 
 void test_svdot_lane_x2_imm_0_7(svint16_t s16, svuint16_t u16, svint8_t s8,
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
new file mode 100644
index 0000000000000..3b5596ac1d5a6
--- /dev/null
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
@@ -0,0 +1,19 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm -o - %s
+
+#include <arm_sve.h>
+
+void missing_sve2p3_luti6(svint8x2_t table, svuint8_t indices) {
+  svluti6_s8(table, indices); // expected-error {{'svluti6_s8' needs target feature sve,sve2p3}}
+}
+
+__attribute__((target("sve2p3")))
+svint8_t has_sve2p3_luti6(svint8x2_t table, svuint8_t indices) {
+  return svluti6_s8(table, indices);
+}
+
+__attribute__((target("sve2p3")))
+svfloat32_t has_sve2p3_implied_sve2p2(svbool_t pg, svfloat16_t op) {
+  return svcvtlt_f32_f16_z(pg, op);
+}
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
new file mode 100644
index 0000000000000..6a2465f4027fc
--- /dev/null
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -0,0 +1,14 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm -o - %s
+
+#include <arm_sve.h>
+
+svfloat16_t missing_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
+  return svluti6_lane_f16_x2(table, indices, 1); // expected-error {{'svluti6_lane_f16_x2' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
+}
+
+__attribute__((target("sve2p3")))
+svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
+  return svluti6_lane_f16_x2(table, indices, 0);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index ba0d7c02bf427..53c9dcca1deba 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2815,12 +2815,31 @@ def int_aarch64_sve_tbx  : AdvSIMD_SVE2_TBX_Intrinsic<[IntrSpeculatable]>;
 
 def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_luti6 : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
+                                  [llvm_nxv16i8_ty,
+                                   llvm_nxv16i8_ty,
+                                   llvm_nxv16i8_ty],
+                                  [IntrNoMem, IntrSpeculatable]>;
 def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                     [LLVMMatchType<0>,
                                     LLVMMatchType<0>,
                                     llvm_nxv16i8_ty,
                                     llvm_i32_ty],
                                     [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
+def int_aarch64_sve_luti6_lane_x2_i16
+    : DefaultAttrsIntrinsic<[llvm_nxv8i16_ty],
+                            [llvm_nxv8i16_ty,
+                             llvm_nxv8i16_ty,
+                             llvm_nxv16i8_ty,
+                             llvm_i32_ty],
+                            [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
+def int_aarch64_sve_luti6_lane_x2_f16
+    : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty],
+                            [llvm_nxv8f16_ty,
+                             llvm_nxv8f16_ty,
+                             llvm_nxv16i8_ty,
+                             llvm_i32_ty],
+                            [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
 
 //
 // SVE2 - Optional bit permutation
@@ -3980,6 +3999,9 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_luti4_lane_zt
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
                             [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>;
+  def int_aarch64_sme_luti6_zt
+    : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_i32_ty, llvm_nxv16i8_ty],
+                            [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrReadMem]>;
 
   // Lookup table expand two registers
   //
@@ -4001,11 +4023,20 @@ let TargetPrefix = "aarch64" in {
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
                             [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>;
+  def int_aarch64_sme_luti6_lane_x4
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+                            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
+                            [ImmArg<ArgIndex<4>>, IntrNoMem, IntrSpeculatable]>;
 
   def int_aarch64_sme_luti4_zt_x4
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
                             [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrReadMem]>;
+  def int_aarch64_sme_luti6_zt_x4
+    : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty, llvm_nxv16i8_ty,
+                             llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+                            [llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+                            [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrReadMem]>;
 
 
   //
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 499bb2325186d..1f6f944b761d4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -415,8 +415,12 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
 
   void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
                                  unsigned Opc, uint32_t MaxImm);
+  void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
+                                      unsigned Opc, uint32_t MaxImm);
 
   void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc);
+  void SelectMultiVectorLutiZT(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
+                               unsigned NumInVecs);
 
   template <unsigned MaxIdx, unsigned Scale>
   bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
@@ -2274,6 +2278,51 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
   CurDAG->RemoveDeadNode(Node);
 }
 
+void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
+                                                         unsigned NumOutVecs,
+                                                         unsigned Opc,
+                                                         uint32_t MaxImm) {
+  const bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN;
+  const unsigned BaseOp = HasChain ? 1 : 0; // Step over the chain, if any.
+  const unsigned Table0Op = BaseOp + 1;     // Operands paired as the table.
+  const unsigned Table1Op = BaseOp + 2;
+  const unsigned Index0Op = BaseOp + 3;     // Operands paired as the indices.
+  const unsigned Index1Op = BaseOp + 4;
+  const unsigned ImmOp = BaseOp + 5;
+
+  SDValue ImmVal = Node->getOperand(ImmOp);
+  if (auto *Imm = dyn_cast<ConstantSDNode>(ImmVal))
+    if (Imm->getZExtValue() > MaxImm)
+      return; // Immediate above MaxImm: leave the node unselected.
+
+  SDLoc DL(Node);
+  EVT VT = Node->getValueType(0);
+  SmallVector<SDValue, 4> Ops = {
+      createZTuple({Node->getOperand(Table0Op), Node->getOperand(Table1Op)}),
+      createZTuple({Node->getOperand(Index0Op), Node->getOperand(Index1Op)}),
+      ImmVal,
+  };
+
+  SDNode *Instruction;
+  if (HasChain) {
+    Ops.push_back(Node->getOperand(0)); // Chain goes last on the machine node.
+    Instruction =
+        CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
+  } else {
+    Instruction = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops);
+  }
+  SDValue SuperReg(Instruction, 0); // Untyped result holding all outputs.
+
+  for (unsigned i = 0; i < NumOutVecs; ++i)
+    ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg(
+                                      AArch64::zsub0 + i, DL, VT, SuperReg));
+
+  if (HasChain) // Chain result follows the NumOutVecs vector results.
+    ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1));
+
+  CurDAG->RemoveDeadNode(Node);
+}
+
 void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
                                                 unsigned NumOutVecs,
                                                 unsigned Opc) {
@@ -2303,6 +2352,50 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
   CurDAG->RemoveDeadNode(Node);
 }
 
+void AArch64DAGToDAGISel::SelectMultiVectorLutiZT(SDNode *Node,
+                                                  unsigned NumOutVecs,
+                                                  unsigned Opc,
+                                                  unsigned NumInVecs) {
+  const unsigned ChainOp = 0;    // Operand 0 is used as the chain below.
+  const unsigned ZtOp = 2;       // ZT selector; operand 1 is not read here.
+  const unsigned FirstVecOp = 3; // First of the NumInVecs input vectors.
+
+  SDValue ZtValue;
+  if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(ZtOp), ZtValue))
+    return; // ZT operand could not be converted: leave the node unselected.
+
+  SDValue ZTuple; // Input vectors packed into a single tuple operand.
+  switch (NumInVecs) {
+  case 2:
+    ZTuple = createZMulTuple(
+        {Node->getOperand(FirstVecOp), Node->getOperand(FirstVecOp + 1)});
+    break;
+  case 3:
+    ZTuple = createZTuple({Node->getOperand(FirstVecOp),
+                           Node->getOperand(FirstVecOp + 1),
+                           Node->getOperand(FirstVecOp + 2)});
+    break;
+  default:
+    llvm_unreachable("unexpected LUTI ZT tuple width");
+  }
+
+  SDValue Ops[] = {ZtValue, ZTuple, Node->getOperand(ChainOp)};
+
+  SDLoc DL(Node);
+  EVT VT = Node->getValueType(0);
+
+  SDNode *Instruction = // Results: untyped register tuple, then the chain.
+      CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
+  SDValue SuperReg(Instruction, 0);
+
+  for (unsigned i = 0; i < NumOutVecs; ++i) // One zsub extract per output.
+    ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg(
+                                      AArch64::zsub0 + i, DL, VT, SuperReg));
+
+  ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1)); // Chain.
+  CurDAG->RemoveDeadNode(Node);
+}
+
 void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs,
                                       unsigned Op) {
   SDLoc DL(N);
@@ -5993,6 +6086,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
       SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z);
       return;
     }
+    case Intrinsic::aarch64_sme_luti6_zt_x4: {
+      SelectMultiVectorLutiZT(Node, 4, AArch64::LUTI6_4ZT3Z, 3);
+      return;
+    }
     case Intrinsic::aarch64_sve_fp8_cvtl1_x2:
       if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::FP>(
               Node->getValueType(0),
@@ -6083,6 +6180,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
                AArch64::SRSHL_VG4_4ZZ_S, AArch64::SRSHL_VG4_4ZZ_D}))
         SelectDestructiveMultiIntrinsic(Node, 4, false, Op);
       return;
+    case Intrinsic::aarch64_sme_luti6_lane_x4:
+      if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
+              Node->getValueType(0), {0, AArch64::LUTI6_4Z2Z2ZI, 0}))
+        SelectMultiVectorLutiLaneTuple(Node, 4, Opc, 1);
+      return;
     case Intrinsic::aarch64_sve_urshl_single_x2:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
               Node->getValueType(0),
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 200808665c93e..09e6b3e6879ea 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -229,7 +229,7 @@ def HasF16MM         : Predicate<"Subtarget->hasF16MM()">,
                                  AssemblerPredicateWithAll<(all_of FeatureF16MM), "f16mm">;
 def HasSVE2p3        : Predicate<"Subtarget->hasSVE2p3()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">;
-def HasSME2p3        : Predicate<"Subtarget->hasSME2p3()">,
+def HasSME2p3        : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p3()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSME2p3), "sme2p3">;
 def HasF16F32DOT     : Predicate<"Subtarget->hasF16F32DOT()">,
                                  AssemblerPredicateWithAll<(all_of FeatureF16F32DOT), "f16f32dot">;
@@ -313,6 +313,14 @@ def HasNonStreamingSVE2p2_or_SME2p2
                 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">,
                 AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2),
                 "sme2p2 or sve2p2">;
+def HasNonStreamingSVE2p3
+    : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()">,
+                AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">;
+def HasNonStreamingSVE2p3_or_SME2p3
+    : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()) ||"
+                "(Subtarget->isStreaming() && Subtarget->hasSME2p3())">,
+                AssemblerPredicateWithAll<(any_of FeatureSVE2p3, FeatureSME2p3),
+                "sme2p3 or sve2p3">;
 
 def HasSMEF16F16_or_SMEF8F16
     : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 022fed6473486..9afeae7c25de1 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -1141,6 +1141,9 @@ let Predicates = [HasSME_MOP4, HasSMEF64F64] in {
 //===----------------------------------------------------------------------===//
 let Predicates = [HasSME2p3] in {
   def LUTI6_ZTZ       : sme2_lut_single<"luti6">;
+  def : Pat<(nxv16i8 (int_aarch64_sme_luti6_zt (imm_to_zt untyped:$zt),
+                      nxv16i8:$zn)),
+            (LUTI6_ZTZ $zt, nxv16i8:$zn)>;
   def LUTI6_4ZT3Z     : sme2_luti6_zt_consecutive<"luti6">;
   def LUTI6_S_4ZT3Z   : sme2_luti6_zt_strided<"luti6">;
   def LUTI6_4Z2Z2ZI   : sme2_luti6_vector_vg4_consecutive<"luti6">;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 0cc788d12bae0..a5856bbe24d1f 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4897,14 +4897,22 @@ let Predicates = [HasSVE2p3_or_SME2p3] in {
   defm SQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"sqshrn",   0b000, int_aarch64_sve_sqshrn_x2>;
   defm UQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"uqshrn",   0b010, int_aarch64_sve_uqshrn_x2>;
 
-  defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">;
 } // End HasSME2p3orSVE2p3
 
+let Predicates = [HasNonStreamingSVE2p3_or_SME2p3] in {
+  defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">;
+}
+
 //===----------------------------------------------------------------------===//
 // SVE2.3 instructions
 //===----------------------------------------------------------------------===//
-let Predicates = [HasSVE2p3] in {
+let Predicates = [HasNonStreamingSVE2p3] in {
   def LUTI6_Z2ZZ : sve2_luti6_vector<"luti6">;
+  def : Pat<(nxv16i8 (int_aarch64_sve_luti6 nxv16i8:$Op1, nxv16i8:$Op2,
+                      nxv16i8:$Op3)),
+            (LUTI6_Z2ZZ (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0,
+                                           nxv16i8:$Op2, zsub1),
+                         nxv16i8:$Op3)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 040962e801604..1619bfe41117f 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11420,6 +11420,17 @@ multiclass sve2_luti6_vector_index<string mnemonic> {
     bit idx;
     let Inst{23} = idx;
   }
+
+  def : Pat<(nxv8i16 (int_aarch64_sve_luti6_lane_x2_i16 nxv8i16:$Op1, nxv8i16:$Op2,
+                      nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
+            (nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
+                                                                      nxv8i16:$Op2, zsub1),
+                                                nxv16i8:$Op3, timm32_0_1:$Op4))>;
+  def : Pat<(nxv8f16 (int_aarch64_sve_luti6_lane_x2_f16 nxv8f16:$Op1, nxv8f16:$Op2,
+                      nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
+            (nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
+                                                                      nxv8f16:$Op2, zsub1),
+                                                nxv16i8:$Op3, timm32_0_1:$Op4))>;
 }
 
 // Look up table
diff --git a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
new file mode 100644
index 0000000000000..07fb62baa58cd
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -force-streaming -mtriple=aarch64-none-linux-gnu -mattr=+sme2p3 < %s | FileCheck %s
+
+target triple = "aarch64-none-linux-gnu"
+
+define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
+; CHECK-LABEL: luti6_zt_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    luti6 z0.b, zt0, z0
+; CHECK-NEXT:    ret
+  %res = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(
+      i32 0, <vscale x 16 x i8> %x)
+  ret <vscale x 16 x i8> %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
+         <vscale x 16 x i8> } @luti6_zt_i8_x4(<vscale x 16 x i8> %a,
+                                              <vscale x 16 x i8> %b,
+                                              <vscale x 16 x i8> %c) #0 {
+; CHECK-LABEL: luti6_zt_i8_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    luti6 { z0.b - z3.b }, zt0, { z0 - z2 }
+; CHECK-NEXT:    ret
+  %res = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>,
+                     <vscale x 16 x i8>, <vscale x 16 x i8> }
+      @llvm.aarch64.sme.luti6.zt.x4(
+          i32 0, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b,
+          <vscale x 16 x i8> %c)
+  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
+        <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+         <vscale x 8 x i16> } @luti6_i16_x4(<vscale x 8 x i16> %a,
+                                            <vscale x 8 x i16> %b,
+                                            <vscale x 16 x i8> %x,
+                                            <vscale x 16 x i8> %y) #0 {
+; CHECK-LABEL: luti6_i16_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1]
+; CHECK-NEXT:    ret
+  %res = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>,
+                     <vscale x 8 x i16>, <vscale x 8 x i16> }
+      @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
+          <vscale x 8 x i16> %a, <vscale x 8 x i16> %b,
+          <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+        <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+         <vscale x 8 x bfloat> } @luti6_bf16_x4(<vscale x 8 x bfloat> %a,
+                                                <vscale x 8 x bfloat> %b,
+                                                <vscale x 16 x i8> %x,
+                                                <vscale x 16 x i8> %y) #0 {
+; CHECK-LABEL: luti6_bf16_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[0]
+; CHECK-NEXT:    ret
+  %res = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+                     <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }
+      @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(
+          <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b,
+          <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 0)
+  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+        <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+         <vscale x 8 x half> } @luti6_f16_x4(<vscale x 8 x half> %a,
+                                             <vscale x 8 x half> %b,
+                                             <vscale x 16 x i8> %x,
+                                             <vscale x 16 x i8> %y) #0 {
+; CHECK-LABEL: luti6_f16_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1]
+; CHECK-NEXT:    ret
+  %res = tail call { <vscale x 8 x half>, <vscale x 8 x half>,
+                     <vscale x 8 x half>, <vscale x 8 x half> }
+      @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(
+          <vscale x 8 x half> %a, <vscale x 8 x half> %b,
+          <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
+  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+        <vscale x 8 x half> } %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(
+    i32, <vscale x 16 x i8>)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
+          <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(
+    i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+          <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
+    <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>,
+    <vscale x 16 x i8>, i32)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+          <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(
+    <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 16 x i8>,
+    <vscale x 16 x i8>, i32)
+declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+          <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(
+    <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>,
+    <vscale x 16 x i8>, i32)
+
+attributes #0 = { "target-features"="+sme2p3" }
diff --git a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
new file mode 100644
index 0000000000000..ab89e87df66d2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve2p3 < %s | FileCheck %s
+
+target triple = "aarch64-none-linux-gnu"
+
+define <vscale x 16 x i8> @luti6_i8(<vscale x 16 x i8> %a,
+; CHECK-LABEL: luti6_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    luti6 z0.b, { z0.b, z1.b }, z2
+; CHECK-NEXT:    ret
+                                    <vscale x 16 x i8> %b,
+                                    <vscale x 16 x i8> %idx) {
+  %res = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(
+      <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %idx)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @luti6_i16_x2(<vscale x 8 x i16> %a,
+; CHECK-LABEL: luti6_i16_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    luti6 z0.h, { z0.h, z1.h }, z2[1]
+; CHECK-NEXT:    ret
+                                        <vscale x 8 x i16> %b,
+                                        <vscale x 16 x i8> %idx) {
+  %res = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
+      <vscale x 8 x i16> %a, <vscale x 8 x i16> %b,
+      <vscale x 16 x i8> %idx, i32 1)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @luti6_f16_x2(<vscale x 8 x half> %a,
+; CHECK-LABEL: luti6_f16_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    luti6 z0.h, { z0.h, z1.h }, z2[0]
+; CHECK-NEXT:    ret
+                                         <vscale x 8 x half> %b,
+                                         <vscale x 16 x i8> %idx) {
+  %res = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
+      <vscale x 8 x half> %a, <vscale x 8 x half> %b,
+      <vscale x 16 x i8> %idx, i32 0)
+  ret <vscale x 8 x half> %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.luti6(
+    <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
+    <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
+    <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>, i32)
diff --git a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
new file mode 100644
index 0000000000000..0777c1db532b1
--- /dev/null
+++ b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
@@ -0,0 +1,79 @@
+; RUN: not opt -S -passes=verify < %s 2>&1 | FileCheck %s
+
+define <vscale x 8 x i16> @bad_sve_luti6_ret(<vscale x 16 x i8> %a,
+                                             <vscale x 16 x i8> %b,
+                                             <vscale x 16 x i8> %idx) {
+; CHECK: Intrinsic has incorrect return type!
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6(
+      <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %idx)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i16> @bad_sve_luti6_lane_x2_arg(<vscale x 4 x i32> %a,
+                                                     <vscale x 8 x i16> %b,
+                                                     <vscale x 16 x i8> %idx) {
+; CHECK: Intrinsic has incorrect argument type!
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
+      <vscale x 4 x i32> %a, <vscale x 8 x i16> %b,
+      <vscale x 16 x i8> %idx, i32 1)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @bad_sve_luti6_lane_x2_f16_arg(
+    <vscale x 8 x i16> %a, <vscale x 8 x half> %b, <vscale x 16 x i8> %idx) {
+; CHECK: Intrinsic has incorrect argument type!
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
+      <vscale x 8 x i16> %a, <vscale x 8 x half> %b,
+      <vscale x 16 x i8> %idx, i32 1)
+  ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x i16> @bad_sme_luti6_zt_ret(i32 %zt,
+                                                <vscale x 16 x i8> %idx) {
+; CHECK: Intrinsic has incorrect return type!
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(
+      i32 %zt, <vscale x 16 x i8> %idx)
+  ret <vscale x 8 x i16> %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+         <vscale x 8 x i16> } @bad_sme_luti6_zt_x4_ret(i32 %zt,
+                                                       <vscale x 16 x i8> %a,
+                                                       <vscale x 16 x i8> %b,
+                                                       <vscale x 16 x i8> %c) {
+; CHECK: Intrinsic has incorrect return type!
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+                <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(
+      i32 %zt, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b,
+      <vscale x 16 x i8> %c)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+        <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+         <vscale x 8 x i16> } @bad_sme_luti6_lane_x4_arg(
+             <vscale x 8 x half> %a, <vscale x 8 x i16> %b,
+             <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; CHECK: Intrinsic has incorrect argument type!
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+                <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
+      <vscale x 8 x half> %a, <vscale x 8 x i16> %b,
+      <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+        <vscale x 8 x i16> } %res
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6(
+    <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
+    <vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
+    <vscale x 8 x i16>, <vscale x 8 x half>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(i32, <vscale x 16 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+          <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(
+    i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+          <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
+    <vscale x 8 x half>, <vscale x 8 x i16>, <vscale x 16 x i8>,
+    <vscale x 16 x i8>, i32)

>From d4e3279b4460c5311b6e41d9ccee2c6e37f4a0ee Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 19 Mar 2026 16:39:11 +0000
Subject: [PATCH 02/22] fixup! Address PR comments

---
 clang/lib/Basic/Targets/AArch64.cpp           | 29 ------------------
 clang/lib/Basic/Targets/AArch64.h             |  2 --
 .../Preprocessor/aarch64-target-features.c    | 23 --------------
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  5 +---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  7 +----
 llvm/lib/Target/AArch64/SMEInstrFormats.td    | 22 ++++++++------
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 30 +++++++++++--------
 7 files changed, 33 insertions(+), 85 deletions(-)

diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index ed07e1415b7f5..9afe6cb10729d 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -500,9 +500,6 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasSVE2p1)
     Builder.defineMacro("__ARM_FEATURE_SVE2p1", "1");
 
-  if (HasSVE2p3)
-    Builder.defineMacro("__ARM_FEATURE_SVE2p3", "1");
-
   if (HasSVE2 && HasSVEAES)
     Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1");
 
@@ -529,9 +526,6 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasSME2p1)
     Builder.defineMacro("__ARM_FEATURE_SME2p1", "1");
 
-  if (HasSME2p3)
-    Builder.defineMacro("__ARM_FEATURE_SME2p3", "1");
-
   if (HasSMEF16F16)
     Builder.defineMacro("__ARM_FEATURE_SME_F16F16", "1");
 
@@ -913,11 +907,9 @@ void AArch64TargetInfo::computeFeatureLookup() {
       .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3)
       .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4)
       .Case("sve2p1", FPU & SveMode && HasSVE2p1)
-      .Case("sve2p3", FPU & SveMode && HasSVE2p3)
       .Case("sme", HasSME)
       .Case("sme2", HasSME2)
       .Case("sme2p1", HasSME2p1)
-      .Case("sme2p3", HasSME2p3)
       .Case("sme-f64f64", HasSMEF64F64)
       .Case("sme-i16i64", HasSMEI16I64)
       .Case("sme-fa64", HasSMEFA64)
@@ -1023,15 +1015,6 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasSVE2 = true;
       HasSVE2p1 = true;
     }
-    if (Feature == "+sve2p3") {
-      FPU |= NeonMode;
-      FPU |= SveMode;
-      HasFullFP16 = true;
-      HasSVE2 = true;
-      HasSVE2p1 = true;
-      HasSVE2p2 = true;
-      HasSVE2p3 = true;
-    }
     if (Feature == "+sve-aes") {
       FPU |= NeonMode;
       HasFullFP16 = true;
@@ -1088,18 +1071,6 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasBFloat16 = true;
       HasFullFP16 = true;
     }
-    if (Feature == "+sme2p3") {
-      HasSME = true;
-      HasSME2 = true;
-      HasSVE2 = true;
-      HasSVE2p1 = true;
-      HasSVE2p2 = true;
-      HasSME2p1 = true;
-      HasSME2p2 = true;
-      HasSME2p3 = true;
-      HasBFloat16 = true;
-      HasFullFP16 = true;
-    }
     if (Feature == "+sme-f64f64") {
       HasSME = true;
       HasSMEF64F64 = true;
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index b3c722a7f6d74..0a29bad81939b 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -86,7 +86,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   bool HasBFloat16 = false;
   bool HasSVE2 = false;
   bool HasSVE2p1 = false;
-  bool HasSVE2p3 = false;
   bool HasSVEAES = false;
   bool HasSVE2SHA3 = false;
   bool HasSVE2SM4 = false;
@@ -112,7 +111,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   bool HasSMEF16F16 = false;
   bool HasSMEB16B16 = false;
   bool HasSME2p1 = false;
-  bool HasSME2p3 = false;
   bool HasFP8 = false;
   bool HasFP8FMA = false;
   bool HasFP8DOT2 = false;
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 6316b25befed8..60ddaad639d48 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -827,32 +827,9 @@
 // CHECK-SVE2p2: __ARM_NEON_FP 0xE
 // CHECK-SVE2p2: __ARM_NEON_SVE_BRIDGE 1
 //
-// RUN: %clang -target aarch64-none-linux-gnu -march=armv9.7-a+sve2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2p3 %s
-// CHECK-SVE2p3: __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
-// CHECK-SVE2p3: __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
-// CHECK-SVE2p3: __ARM_FEATURE_SVE 1
-// CHECK-SVE2p3: __ARM_FEATURE_SVE2 1
-// CHECK-SVE2p3: __ARM_FEATURE_SVE2p1 1
-// CHECK-SVE2p3: __ARM_FEATURE_SVE2p2 1
-// CHECK-SVE2p3: __ARM_FEATURE_SVE2p3 1
-// CHECK-SVE2p3: __ARM_NEON 1
-// CHECK-SVE2p3: __ARM_NEON_FP 0xE
-// CHECK-SVE2p3: __ARM_NEON_SVE_BRIDGE 1
-// CHECK-SVE2p3-NOT: __ARM_FEATURE_SME2p3 1
-//
 // RUN: %clang --target=aarch64 -march=armv9-a+sme2p2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p2 %s
 // CHECK-SME2p2: __ARM_FEATURE_LOCALLY_STREAMING 1
 // CHECK-SME2p2: __ARM_FEATURE_SME 1
 // CHECK-SME2p2: __ARM_FEATURE_SME2 1
 // CHECK-SME2p2: __ARM_FEATURE_SME2p1 1
 // CHECK-SME2p2: __ARM_FEATURE_SME2p2 1
-//
-// RUN: %clang --target=aarch64 -march=armv9.7-a+sme2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p3 %s
-// CHECK-SME2p3: __ARM_FEATURE_LOCALLY_STREAMING 1
-// CHECK-SME2p3: __ARM_FEATURE_SME 1
-// CHECK-SME2p3: __ARM_FEATURE_SME2 1
-// CHECK-SME2p3: __ARM_FEATURE_SME2p1 1
-// CHECK-SME2p3: __ARM_FEATURE_SME2p2 1
-// CHECK-SME2p3: __ARM_FEATURE_SME2p3 1
-// CHECK-SME2p3: __ARM_FEATURE_SVE2p1 1
-// CHECK-SME2p3: __ARM_FEATURE_SVE2p2 1
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 9afeae7c25de1..d0eb9ca218a27 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -1140,10 +1140,7 @@ let Predicates = [HasSME_MOP4, HasSMEF64F64] in {
 // SME2.3 instructions
 //===----------------------------------------------------------------------===//
 let Predicates = [HasSME2p3] in {
-  def LUTI6_ZTZ       : sme2_lut_single<"luti6">;
-  def : Pat<(nxv16i8 (int_aarch64_sme_luti6_zt (imm_to_zt untyped:$zt),
-                      nxv16i8:$zn)),
-            (LUTI6_ZTZ $zt, nxv16i8:$zn)>;
+  defm LUTI6_ZTZ      : sme2_lut_single<"luti6", int_aarch64_sme_luti6_zt>;
   def LUTI6_4ZT3Z     : sme2_luti6_zt_consecutive<"luti6">;
   def LUTI6_S_4ZT3Z   : sme2_luti6_zt_strided<"luti6">;
   def LUTI6_4Z2Z2ZI   : sme2_luti6_vector_vg4_consecutive<"luti6">;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index a5856bbe24d1f..3fde260522f75 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4907,12 +4907,7 @@ let Predicates = [HasNonStreamingSVE2p3_or_SME2p3] in {
 // SVE2.3 instructions
 //===----------------------------------------------------------------------===//
 let Predicates = [HasNonStreamingSVE2p3] in {
-  def LUTI6_Z2ZZ : sve2_luti6_vector<"luti6">;
-  def : Pat<(nxv16i8 (int_aarch64_sve_luti6 nxv16i8:$Op1, nxv16i8:$Op2,
-                      nxv16i8:$Op3)),
-            (LUTI6_Z2ZZ (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0,
-                                           nxv16i8:$Op2, zsub1),
-                         nxv16i8:$Op3)>;
+  defm LUTI6_Z2ZZ : sve2_luti6_vector<"luti6", int_aarch64_sve_luti6>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 771c4c1fb2b6e..31fff8767fbdd 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -3921,15 +3921,19 @@ multiclass sme2_luti4_vector_vg4_index<string mnemonic> {
 }
 
 // 8-bit Look up table
-class sme2_lut_single<string asm>
-  : I<(outs ZPR8:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn),
-    asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> {
-  bits<0> ZTt;
-  bits<5> Zd;
-  bits<5> Zn;
-  let Inst{31-10} = 0b1100000011001000010000;
-  let Inst{9-5}   = Zn;
-  let Inst{4-0}   = Zd;
+multiclass sme2_lut_single<string asm, SDPatternOperator intrinsic> {
+  def NAME : I<(outs ZPR8:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn),
+                asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> {
+    bits<0> ZTt;
+    bits<5> Zd;
+    bits<5> Zn;
+    let Inst{31-10} = 0b1100000011001000010000;
+    let Inst{9-5}   = Zn;
+    let Inst{4-0}   = Zd;
+  }
+
+  def : Pat<(nxv16i8 (intrinsic (imm_to_zt untyped:$zt), nxv16i8:$zn)),
+            (!cast<Instruction>(NAME) $zt, nxv16i8:$zn)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 1619bfe41117f..dd6811b3d8a2b 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11434,18 +11434,24 @@ multiclass sve2_luti6_vector_index<string mnemonic> {
 }
 
 // Look up table
-class sve2_luti6_vector<string mnemonic>
-    : I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, ZPRAny:$Zm),
-      mnemonic, "\t$Zd, $Zn, $Zm",
-      "", []>, Sched<[]> {
-  bits<5> Zd;
-  bits<5> Zn;
-  bits<5> Zm;
-  let Inst{31-21} = 0b01000101001;
-  let Inst{20-16} = Zm;
-  let Inst{15-10} = 0b101011;
-  let Inst{9-5}   = Zn;
-  let Inst{4-0}   = Zd;
+multiclass sve2_luti6_vector<string mnemonic, SDPatternOperator intrinsic> {
+  def NAME : I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, ZPRAny:$Zm),
+                mnemonic, "\t$Zd, $Zn, $Zm",
+                "", []>, Sched<[]> {
+    bits<5> Zd;
+    bits<5> Zn;
+    bits<5> Zm;
+    let Inst{31-21} = 0b01000101001;
+    let Inst{20-16} = Zm;
+    let Inst{15-10} = 0b101011;
+    let Inst{9-5}   = Zn;
+    let Inst{4-0}   = Zd;
+  }
+
+  def : Pat<(nxv16i8 (intrinsic nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)),
+            (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0,
+                                                     nxv16i8:$Op2, zsub1),
+                                       nxv16i8:$Op3)>;
 }
 
 //===----------------------------------------------------------------------===//

>From 66f472127330d6a383deb9446197153418ecc6f4 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 19 Mar 2026 19:39:05 +0000
Subject: [PATCH 03/22] fixup! Reuse SelectMultiVectorLuti()

---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 42 +++----------------
 1 file changed, 6 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 1f6f944b761d4..c5851e3fcdde1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -418,9 +418,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
   void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
                                       unsigned Opc, uint32_t MaxImm);
 
-  void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc);
-  void SelectMultiVectorLutiZT(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
-                               unsigned NumInVecs);
+  void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
+                             unsigned NumInVecs);
 
   template <unsigned MaxIdx, unsigned Scale>
   bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
@@ -2325,37 +2324,8 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
 
 void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
                                                 unsigned NumOutVecs,
-                                                unsigned Opc) {
-  SDValue ZtValue;
-  if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
-    return;
-
-  SDValue Chain = Node->getOperand(0);
-  SDValue Ops[] = {ZtValue,
-                   createZMulTuple({Node->getOperand(3), Node->getOperand(4)}),
-                   Chain};
-
-  SDLoc DL(Node);
-  EVT VT = Node->getValueType(0);
-
-  SDNode *Instruction =
-      CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
-  SDValue SuperReg = SDValue(Instruction, 0);
-
-  for (unsigned I = 0; I < NumOutVecs; ++I)
-    ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
-                                      AArch64::zsub0 + I, DL, VT, SuperReg));
-
-  // Copy chain
-  unsigned ChainIdx = NumOutVecs;
-  ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1));
-  CurDAG->RemoveDeadNode(Node);
-}
-
-void AArch64DAGToDAGISel::SelectMultiVectorLutiZT(SDNode *Node,
-                                                  unsigned NumOutVecs,
-                                                  unsigned Opc,
-                                                  unsigned NumInVecs) {
+                                                unsigned Opc,
+                                                unsigned NumInVecs) {
   const unsigned ChainOp = 0;
   const unsigned ZtOp = 2;
   const unsigned FirstVecOp = 3;
@@ -6083,11 +6053,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
       return;
     }
     case Intrinsic::aarch64_sme_luti4_zt_x4: {
-      SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z);
+      SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z, 2);
       return;
     }
     case Intrinsic::aarch64_sme_luti6_zt_x4: {
-      SelectMultiVectorLutiZT(Node, 4, AArch64::LUTI6_4ZT3Z, 3);
+      SelectMultiVectorLuti(Node, 4, AArch64::LUTI6_4ZT3Z, 3);
       return;
     }
     case Intrinsic::aarch64_sve_fp8_cvtl1_x2:

>From b3bbd5c6dfeab724e96388437468498816bde8ea Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 19 Mar 2026 19:49:15 +0000
Subject: [PATCH 04/22] fixup! Add overloaded
 AArch64DAGToDAGISel::EmitMultiVectorLutiLane() for reuse

---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 89 ++++++++-----------
 1 file changed, 39 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index c5851e3fcdde1..deecb2312aed7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -415,6 +415,9 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
 
   void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
                                  unsigned Opc, uint32_t MaxImm);
+  void EmitMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
+                               unsigned Opc, ArrayRef<SDValue> Ops,
+                               bool HasChain);
   void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
                                       unsigned Opc, uint32_t MaxImm);
 
@@ -2246,6 +2249,35 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
   SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode);
 }
 
+void AArch64DAGToDAGISel::EmitMultiVectorLutiLane(SDNode *Node,
+                                                  unsigned NumOutVecs,
+                                                  unsigned Opc,
+                                                  ArrayRef<SDValue> Ops,
+                                                  bool HasChain) {
+  SDLoc DL(Node);
+  EVT VT = Node->getValueType(0);
+
+  SmallVector<SDValue, 4> MachineOps(Ops);
+  SDNode *Instruction;
+  if (HasChain) {
+    MachineOps.push_back(Node->getOperand(0));
+    Instruction =
+        CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, MachineOps);
+  } else {
+    Instruction = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, MachineOps);
+  }
+  SDValue SuperReg(Instruction, 0);
+
+  for (unsigned i = 0; i < NumOutVecs; ++i)
+    ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg(
+                                      AArch64::zsub0 + i, DL, VT, SuperReg));
+
+  if (HasChain)
+    ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1));
+
+  CurDAG->RemoveDeadNode(Node);
+}
+
 void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
                                                     unsigned NumOutVecs,
                                                     unsigned Opc,
@@ -2258,68 +2290,25 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
   if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
     return;
 
-  SDValue Chain = Node->getOperand(0);
-  SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4), Chain};
-  SDLoc DL(Node);
-  EVT VT = Node->getValueType(0);
-
-  SDNode *Instruction =
-      CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
-  SDValue SuperReg = SDValue(Instruction, 0);
-
-  for (unsigned I = 0; I < NumOutVecs; ++I)
-    ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
-                                      AArch64::zsub0 + I, DL, VT, SuperReg));
-
-  // Copy chain
-  unsigned ChainIdx = NumOutVecs;
-  ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1));
-  CurDAG->RemoveDeadNode(Node);
+  SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
+  EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops, /*HasChain=*/true);
 }
 
 void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
                                                          unsigned NumOutVecs,
                                                          unsigned Opc,
                                                          uint32_t MaxImm) {
-  const bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN;
-  const unsigned BaseOp = HasChain ? 1 : 0;
-  const unsigned t0 = BaseOp + 1;
-  const unsigned t1 = BaseOp + 2;
-  const unsigned i0 = BaseOp + 3;
-  const unsigned i1 = BaseOp + 4;
-  const unsigned ImmOp = BaseOp + 5;
-
-  SDValue ImmVal = Node->getOperand(ImmOp);
+  SDValue ImmVal = Node->getOperand(5);
   if (auto *Imm = dyn_cast<ConstantSDNode>(ImmVal))
     if (Imm->getZExtValue() > MaxImm)
       return;
 
-  SDLoc DL(Node);
-  EVT VT = Node->getValueType(0);
   SmallVector<SDValue, 4> Ops = {
-      createZTuple({Node->getOperand(t0), Node->getOperand(t1)}),
-      createZTuple({Node->getOperand(i0), Node->getOperand(i1)}),
-      Node->getOperand(ImmOp),
+      createZTuple({Node->getOperand(1), Node->getOperand(2)}),
+      createZTuple({Node->getOperand(3), Node->getOperand(4)}),
+      Node->getOperand(5),
   };
-
-  SDNode *Instruction;
-  if (HasChain) {
-    Ops.push_back(Node->getOperand(0));
-    Instruction =
-        CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
-  } else {
-    Instruction = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops);
-  }
-  SDValue SuperReg(Instruction, 0);
-
-  for (unsigned i = 0; i < NumOutVecs; ++i)
-    ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg(
-                                      AArch64::zsub0 + i, DL, VT, SuperReg));
-
-  if (HasChain)
-    ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1));
-
-  CurDAG->RemoveDeadNode(Node);
+  EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops, /*HasChain=*/false);
 }
 
 void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,

>From 1d3b24a66ff87953126a1d9322f28a767b7558f4 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 23 Mar 2026 15:51:29 +0000
Subject: [PATCH 05/22] fixup! Address PR comments

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td    | 11 +----------
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td |  4 ++--
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 09e6b3e6879ea..95b4bc86a3fe6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -227,7 +227,7 @@ def HasSVE_B16MM     : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasS
                                  AssemblerPredicateWithAll<(all_of FeatureSVE_B16MM), "sve-b16mm">;
 def HasF16MM         : Predicate<"Subtarget->hasF16MM()">,
                                  AssemblerPredicateWithAll<(all_of FeatureF16MM), "f16mm">;
-def HasSVE2p3        : Predicate<"Subtarget->hasSVE2p3()">,
+def HasSVE2p3        : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">;
 def HasSME2p3        : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p3()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSME2p3), "sme2p3">;
@@ -313,15 +313,6 @@ def HasNonStreamingSVE2p2_or_SME2p2
                 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">,
                 AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2),
                 "sme2p2 or sve2p2">;
-def HasNonStreamingSVE2p3
-    : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()">,
-                AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">;
-def HasNonStreamingSVE2p3_or_SME2p3
-    : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()) ||"
-                "(Subtarget->isStreaming() && Subtarget->hasSME2p3())">,
-                AssemblerPredicateWithAll<(any_of FeatureSVE2p3, FeatureSME2p3),
-                "sme2p3 or sve2p3">;
-
 def HasSMEF16F16_or_SMEF8F16
     : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">,
                 AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3fde260522f75..4f49ef1b795ca 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4899,14 +4899,14 @@ let Predicates = [HasSVE2p3_or_SME2p3] in {
 
 } // End HasSME2p3orSVE2p3
 
-let Predicates = [HasNonStreamingSVE2p3_or_SME2p3] in {
+let Predicates = [HasSVE2p3_or_SME2p3] in {
   defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">;
 }
 
 //===----------------------------------------------------------------------===//
 // SVE2.3 instructions
 //===----------------------------------------------------------------------===//
-let Predicates = [HasNonStreamingSVE2p3] in {
+let Predicates = [HasSVE2p3] in {
   defm LUTI6_Z2ZZ : sve2_luti6_vector<"luti6", int_aarch64_sve_luti6>;
 }
 

>From 523dd9533b0c81e50305b4eb50290cf7d96b9c85 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 23 Mar 2026 21:13:29 +0000
Subject: [PATCH 06/22] fixup! Address more PR comments

---
 clang/include/clang/Basic/arm_sme.td          |  1 -
 clang/include/clang/Basic/arm_sve.td          |  4 ++--
 clang/lib/Sema/SemaARM.cpp                    | 24 ++++++++++++++++---
 .../sme2p3-intrinsics/acle_sme2p3_luti6.c     | 16 +++++++++----
 .../sve2p3-intrinsics/acle_sve2p3_luti6.c     | 12 +++++-----
 .../acle_sme2p3_target_lane.c                 |  2 +-
 .../acle_sve2p3_target_lane.c                 |  9 +++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     | 15 ++++--------
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  5 ++--
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |  4 ++--
 10 files changed, 59 insertions(+), 33 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 8de360fca5f5e..678fa1efc2a51 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -984,7 +984,6 @@ let SMETargetGuard = "sme-lutv2" in {
 let SMETargetGuard = "sme2p3" in {
   def SVLUTI6_ZT      : SInst<"svluti6_zt_{d}", "diu", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
   def SVLUTI6_ZT_X4   : SInst<"svluti6_zt_{d}_x4", "4i3.u", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
-  def SVLUTI6_LANE_X4 : SInst<"svluti6_lane[_{d}_x4]", "42.d2.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
 }
 
 let SMETargetGuard = "sme-f8f32" in {
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 6d2bd0d30455f..2760aec2f23bf 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1924,8 +1924,8 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
 }
 
 let SVETargetGuard = "sve2p3", SMETargetGuard = "sme2p3" in {
-  def SVLUTI6_x2_I16 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUs", MergeNone, "aarch64_sve_luti6_lane_x2_i16", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
-  def SVLUTI6_x2_F16 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "h", MergeNone, "aarch64_sve_luti6_lane_x2_f16", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+  def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+  def SVLUTI6_x4 : SInst<"svluti6_lane[_{d}_x4]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 5e7504fab416d..e7b86f998a509 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Sema/SemaARM.h"
+#include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
@@ -617,9 +618,26 @@ static bool checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
       BuiltinType = SemaARM::ArmNonStreaming;
     else if (SatisfiesSME)
       BuiltinType = SemaARM::ArmStreaming;
-    else
-      // This should be diagnosed by CodeGen
-      return false;
+    else {
+      switch (BuiltinID) {
+      case SVE::BI__builtin_sve_svluti6_lane_bf16_x4:
+      case SVE::BI__builtin_sve_svluti6_lane_f16_x4:
+      case SVE::BI__builtin_sve_svluti6_lane_s16_x4:
+      case SVE::BI__builtin_sve_svluti6_lane_u16_x4: {
+        std::string BuiltinName =
+            std::string(S.Context.BuiltinInfo.getQuotedName(BuiltinID));
+        const FunctionDecl *Callee = TheCall->getDirectCallee();
+        if (Callee)
+          BuiltinName = "'" + Callee->getName().str() + "'";
+        S.Diag(TheCall->getBeginLoc(), diag::err_builtin_needs_feature)
+            << BuiltinName << RequiredFeatures;
+        return true;
+      }
+      default:
+        // This should be diagnosed by CodeGen.
+        return false;
+      }
+    }
   }
 
   if (FnType != SemaARM::ArmNonStreaming &&
diff --git a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
index ae5fb1f64d0fc..02dac71bb8de7 100644
--- a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
@@ -3,10 +3,18 @@
 
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sme.h>
 
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3_UNUSED,A4_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
 // CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_s16_x4(
 // CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -21,7 +29,7 @@
 //
 svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return svluti6_lane_s16_x4(table, indices, 1);
+  return SVE_ACLE_FUNC(svluti6_lane,_s16,_x4,)(table, indices, 1);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4(
@@ -38,7 +46,7 @@ svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
 //
 svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return svluti6_lane_u16_x4(table, indices, 0);
+  return SVE_ACLE_FUNC(svluti6_lane,_u16,_x4,)(table, indices, 0);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4(
@@ -55,7 +63,7 @@ svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
 //
 svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return svluti6_lane_f16_x4(table, indices, 1);
+  return SVE_ACLE_FUNC(svluti6_lane,_f16,_x4,)(table, indices, 1);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4(
@@ -72,7 +80,7 @@ svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
 //
 svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return svluti6_lane_bf16_x4(table, indices, 0);
+  return SVE_ACLE_FUNC(svluti6_lane,_bf16,_x4,)(table, indices, 0);
 }
 
 // CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_s8(
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
index a806ef0b13c20..b70a83b91a5af 100644
--- a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
@@ -66,13 +66,13 @@ svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) {
 // CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
 // CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
 svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) {
@@ -82,13 +82,13 @@ svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) {
 // CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
 // CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
 svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) {
@@ -98,13 +98,13 @@ svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) {
 // CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
 // CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
 svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) {
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
index 1a06663a9aab7..74bdbf8723fed 100644
--- a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
@@ -6,7 +6,7 @@
 
 svbfloat16x4_t missing_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature sme,sme2p3}}
+  return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
 }
 
 __attribute__((target("sme2p3,bf16")))
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
index 6a2465f4027fc..2aec0c5daa039 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -12,3 +12,12 @@ __attribute__((target("sve2p3")))
 svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
   return svluti6_lane_f16_x2(table, indices, 0);
 }
+
+svfloat16x4_t missing_sve2p3_luti6_lane_x4(svfloat16x2_t table, svuint8x2_t indices) {
+  return svluti6_lane_f16_x4(table, indices, 1); // expected-error {{'svluti6_lane_f16_x4' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
+}
+
+__attribute__((target("sve2p3")))
+svfloat16x4_t has_sve2p3_luti6_lane_x4(svfloat16x2_t table, svuint8x2_t indices) {
+  return svluti6_lane_f16_x4(table, indices, 0);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 53c9dcca1deba..a5f2e92faeedf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2826,17 +2826,10 @@ def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                     llvm_nxv16i8_ty,
                                     llvm_i32_ty],
                                     [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
-def int_aarch64_sve_luti6_lane_x2_i16
-    : DefaultAttrsIntrinsic<[llvm_nxv8i16_ty],
-                            [llvm_nxv8i16_ty,
-                             llvm_nxv8i16_ty,
-                             llvm_nxv16i8_ty,
-                             llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
-def int_aarch64_sve_luti6_lane_x2_f16
-    : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty],
-                            [llvm_nxv8f16_ty,
-                             llvm_nxv8f16_ty,
+def int_aarch64_sve_luti6_lane_x2
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             LLVMMatchType<0>,
                              llvm_nxv16i8_ty,
                              llvm_i32_ty],
                             [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index deecb2312aed7..ce545137b257f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -415,9 +415,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
 
   void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
                                  unsigned Opc, uint32_t MaxImm);
-  void EmitMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
-                               unsigned Opc, ArrayRef<SDValue> Ops,
-                               bool HasChain);
+  void EmitMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
+                               ArrayRef<SDValue> Ops, bool HasChain);
   void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
                                       unsigned Opc, uint32_t MaxImm);
 
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index dd6811b3d8a2b..9b0abfbee790d 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11421,12 +11421,12 @@ multiclass sve2_luti6_vector_index<string mnemonic> {
     let Inst{23} = idx;
   }
 
-  def : Pat<(nxv8i16 (int_aarch64_sve_luti6_lane_x2_i16 nxv8i16:$Op1, nxv8i16:$Op2,
+  def : Pat<(nxv8i16 (int_aarch64_sve_luti6_lane_x2 nxv8i16:$Op1, nxv8i16:$Op2,
                       nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
             (nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
                                                                       nxv8i16:$Op2, zsub1),
                                                 nxv16i8:$Op3, timm32_0_1:$Op4))>;
-  def : Pat<(nxv8f16 (int_aarch64_sve_luti6_lane_x2_f16 nxv8f16:$Op1, nxv8f16:$Op2,
+  def : Pat<(nxv8f16 (int_aarch64_sve_luti6_lane_x2 nxv8f16:$Op1, nxv8f16:$Op2,
                       nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
             (nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
                                                                       nxv8f16:$Op2, zsub1),

>From 2342033a866012ecbb41bd034693e98e838084c9 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 23 Mar 2026 22:55:41 +0000
Subject: [PATCH 07/22] fixup! Fix final PR comments for now

---
 clang/include/clang/Basic/arm_sve.td          |   2 +-
 .../sve2p3-intrinsics/acle_sve2p3_luti6.c     | 126 ++++++++++++++++--
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |   5 +
 .../test/Verifier/AArch64/luti6-intrinsics.ll |   1 +
 4 files changed, 125 insertions(+), 9 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 2760aec2f23bf..c82971f2dffa4 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1924,7 +1924,7 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
 }
 
 let SVETargetGuard = "sve2p3", SMETargetGuard = "sme2p3" in {
-  def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+  def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
   def SVLUTI6_x4 : SInst<"svluti6_lane[_{d}_x4]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
 }
 
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
index b70a83b91a5af..59d24b641f9d5 100644
--- a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
@@ -1,11 +1,15 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
 // REQUIRES: aarch64-registered-target
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sve.h>
 
@@ -15,18 +19,36 @@
 #define SVE_ACLE_FUNC(A1, A2) A1##A2
 #endif
 
+#ifdef STREAMING_MODE
+#define STREAMING_ATTR __arm_streaming
+#else
+#define STREAMING_ATTR
+#endif
+
 // CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8(
 // CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8(
+// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
 svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) {
   return SVE_ACLE_FUNC(svluti6, _s8)(table, indices);
 }
@@ -37,12 +59,24 @@ svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) {
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_u8(
+// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
 svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) {
   return SVE_ACLE_FUNC(svluti6, _u8)(table, indices);
 }
@@ -53,12 +87,24 @@ svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) {
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_mf8(
+// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
 svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) {
   return SVE_ACLE_FUNC(svluti6, _mf8)(table, indices);
 }
@@ -69,13 +115,25 @@ svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) {
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) {
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) STREAMING_ATTR {
   return SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(table, indices, 1);
 }
 
@@ -85,13 +143,25 @@ svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) {
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) {
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) STREAMING_ATTR {
   return SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(table, indices, 0);
 }
 
@@ -101,12 +171,52 @@ svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) {
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) {
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
   return SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(table, indices, 1);
 }
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svluti6_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
+  return SVE_ACLE_FUNC(svluti6_lane, _bf16_x2)(table, indices, 0);
+}
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 9b0abfbee790d..a16e364fb6d32 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11431,6 +11431,11 @@ multiclass sve2_luti6_vector_index<string mnemonic> {
             (nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
                                                                       nxv8f16:$Op2, zsub1),
                                                 nxv16i8:$Op3, timm32_0_1:$Op4))>;
+  def : Pat<(nxv8bf16 (int_aarch64_sve_luti6_lane_x2 nxv8bf16:$Op1, nxv8bf16:$Op2,
+                       nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
+            (nxv8bf16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0,
+                                                                        nxv8bf16:$Op2, zsub1),
+                                                 nxv16i8:$Op3, timm32_0_1:$Op4))>;
 }
 
 // Look up table
diff --git a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
index 0777c1db532b1..7818dd19ffb1a 100644
--- a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
+++ b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: not opt -S -passes=verify < %s 2>&1 | FileCheck %s
 
 define <vscale x 8 x i16> @bad_sve_luti6_ret(<vscale x 16 x i8> %a,

>From 4bdd136d1649800dd89555b702d91152c3f7add8 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 23 Mar 2026 23:30:23 +0000
Subject: [PATCH 08/22] fixup! Address more PR comments

---
 clang/include/clang/Basic/arm_sve.td          |  5 +++-
 clang/lib/Sema/SemaARM.cpp                    | 23 +++----------------
 .../acle_sme2p3_target_lane.c                 |  2 +-
 .../acle_sve2p3_target_lane.c                 |  9 --------
 4 files changed, 8 insertions(+), 31 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index c82971f2dffa4..d3526d4950a06 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1925,7 +1925,10 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
 
 let SVETargetGuard = "sve2p3", SMETargetGuard = "sme2p3" in {
   def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
-  def SVLUTI6_x4 : SInst<"svluti6_lane[_{d}_x4]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+}
+
+let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2p3" in {
+  def SVLUTI6_x4 : SInst<"svluti6_lane[_{d}_x4]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index e7b86f998a509..bb99f3fd0ce94 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -618,26 +618,9 @@ static bool checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
       BuiltinType = SemaARM::ArmNonStreaming;
     else if (SatisfiesSME)
       BuiltinType = SemaARM::ArmStreaming;
-    else {
-      switch (BuiltinID) {
-      case SVE::BI__builtin_sve_svluti6_lane_bf16_x4:
-      case SVE::BI__builtin_sve_svluti6_lane_f16_x4:
-      case SVE::BI__builtin_sve_svluti6_lane_s16_x4:
-      case SVE::BI__builtin_sve_svluti6_lane_u16_x4: {
-        std::string BuiltinName =
-            std::string(S.Context.BuiltinInfo.getQuotedName(BuiltinID));
-        const FunctionDecl *Callee = TheCall->getDirectCallee();
-        if (Callee)
-          BuiltinName = "'" + Callee->getName().str() + "'";
-        S.Diag(TheCall->getBeginLoc(), diag::err_builtin_needs_feature)
-            << BuiltinName << RequiredFeatures;
-        return true;
-      }
-      default:
-        // This should be diagnosed by CodeGen.
-        return false;
-      }
-    }
+    else
+      // This should be diagnosed by CodeGen.
+      return false;
   }
 
   if (FnType != SemaARM::ArmNonStreaming &&
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
index 74bdbf8723fed..1a06663a9aab7 100644
--- a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
@@ -6,7 +6,7 @@
 
 svbfloat16x4_t missing_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
+  return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature sme,sme2p3}}
 }
 
 __attribute__((target("sme2p3,bf16")))
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
index 2aec0c5daa039..6a2465f4027fc 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -12,12 +12,3 @@ __attribute__((target("sve2p3")))
 svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
   return svluti6_lane_f16_x2(table, indices, 0);
 }
-
-svfloat16x4_t missing_sve2p3_luti6_lane_x4(svfloat16x2_t table, svuint8x2_t indices) {
-  return svluti6_lane_f16_x4(table, indices, 1); // expected-error {{'svluti6_lane_f16_x4' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
-}
-
-__attribute__((target("sve2p3")))
-svfloat16x4_t has_sve2p3_luti6_lane_x4(svfloat16x2_t table, svuint8x2_t indices) {
-  return svluti6_lane_f16_x4(table, indices, 0);
-}

>From a50ee6664d3ac744b288523037059acd11b3abb0 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Wed, 8 Apr 2026 14:21:23 +0100
Subject: [PATCH 09/22] fixup! Fix PR comments

---
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  1 -
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 53 ++++-------
 .../AArch64/sme2p3-intrinsics-luti6.ll        | 94 +++++--------------
 .../AArch64/sve2p3-intrinsics-luti6.ll        | 48 +++-------
 .../test/Verifier/AArch64/luti6-intrinsics.ll | 74 ++++-----------
 5 files changed, 75 insertions(+), 195 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index a5f2e92faeedf..8b6b9e403f16f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1049,7 +1049,6 @@ def llvm_nxv4i1_ty  : LLVMType<nxv4i1>;
 def llvm_nxv8i1_ty  : LLVMType<nxv8i1>;
 def llvm_nxv16i1_ty : LLVMType<nxv16i1>;
 def llvm_nxv16i8_ty : LLVMType<nxv16i8>;
-def llvm_nxv8i16_ty : LLVMType<nxv8i16>;
 def llvm_nxv4i32_ty : LLVMType<nxv4i32>;
 def llvm_nxv2i64_ty : LLVMType<nxv2i64>;
 def llvm_nxv8f16_ty : LLVMType<nxv8f16>;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index ce545137b257f..1821c407681a0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -416,7 +416,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
   void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
                                  unsigned Opc, uint32_t MaxImm);
   void EmitMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
-                               ArrayRef<SDValue> Ops, bool HasChain);
+                               ArrayRef<SDValue> Ops);
   void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
                                       unsigned Opc, uint32_t MaxImm);
 
@@ -2251,20 +2251,18 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
 void AArch64DAGToDAGISel::EmitMultiVectorLutiLane(SDNode *Node,
                                                   unsigned NumOutVecs,
                                                   unsigned Opc,
-                                                  ArrayRef<SDValue> Ops,
-                                                  bool HasChain) {
+                                                  ArrayRef<SDValue> Ops) {
   SDLoc DL(Node);
   EVT VT = Node->getValueType(0);
+  bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN;
 
   SmallVector<SDValue, 4> MachineOps(Ops);
-  SDNode *Instruction;
-  if (HasChain) {
+  SmallVector<EVT, 2> ResTys = {MVT::Untyped};
+  if (HasChain)
     MachineOps.push_back(Node->getOperand(0));
-    Instruction =
-        CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, MachineOps);
-  } else {
-    Instruction = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, MachineOps);
-  }
+  if (HasChain)
+    ResTys.push_back(MVT::Other);
+  SDNode *Instruction = CurDAG->getMachineNode(Opc, DL, ResTys, MachineOps);
   SDValue SuperReg(Instruction, 0);
 
   for (unsigned i = 0; i < NumOutVecs; ++i)
@@ -2290,7 +2288,7 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
     return;
 
   SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
-  EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops, /*HasChain=*/true);
+  EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops);
 }
 
 void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
@@ -2307,37 +2305,20 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
       createZTuple({Node->getOperand(3), Node->getOperand(4)}),
       Node->getOperand(5),
   };
-  EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops, /*HasChain=*/false);
+  EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops);
 }
 
 void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
                                                 unsigned NumOutVecs,
                                                 unsigned Opc,
                                                 unsigned NumInVecs) {
-  const unsigned ChainOp = 0;
-  const unsigned ZtOp = 2;
-  const unsigned FirstVecOp = 3;
-
   SDValue ZtValue;
-  if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(ZtOp), ZtValue))
+  if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
     return;
 
-  SDValue ZTuple;
-  switch (NumInVecs) {
-  case 2:
-    ZTuple = createZMulTuple(
-        {Node->getOperand(FirstVecOp), Node->getOperand(FirstVecOp + 1)});
-    break;
-  case 3:
-    ZTuple = createZTuple({Node->getOperand(FirstVecOp),
-                           Node->getOperand(FirstVecOp + 1),
-                           Node->getOperand(FirstVecOp + 2)});
-    break;
-  default:
-    llvm_unreachable("unexpected LUTI ZT tuple width");
-  }
-
-  SDValue Ops[] = {ZtValue, ZTuple, Node->getOperand(ChainOp)};
+  SmallVector<SDValue, 4> Regs(Node->ops().slice(3, NumInVecs));
+  SDValue ZTuple = NumInVecs == 2 ? createZMulTuple(Regs) : createZTuple(Regs);
+  SDValue Ops[] = {ZtValue, ZTuple, Node->getOperand(0)};
 
   SDLoc DL(Node);
   EVT VT = Node->getValueType(0);
@@ -2346,9 +2327,9 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
       CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
   SDValue SuperReg(Instruction, 0);
 
-  for (unsigned i = 0; i < NumOutVecs; ++i)
-    ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg(
-                                      AArch64::zsub0 + i, DL, VT, SuperReg));
+  for (unsigned I = 0; I < NumOutVecs; ++I)
+    ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
+                                      AArch64::zsub0 + I, DL, VT, SuperReg));
 
   ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1));
   CurDAG->RemoveDeadNode(Node);
diff --git a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
index 07fb62baa58cd..95055414ef562 100644
--- a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
@@ -1,8 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -verify-machineinstrs -force-streaming -mtriple=aarch64-none-linux-gnu -mattr=+sme2p3 < %s | FileCheck %s
 
-target triple = "aarch64-none-linux-gnu"
-
 define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
 ; CHECK-LABEL: luti6_zt_i8:
 ; CHECK:       // %bb.0:
@@ -13,93 +11,49 @@ define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
   ret <vscale x 16 x i8> %res
 }
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
-         <vscale x 16 x i8> } @luti6_zt_i8_x4(<vscale x 16 x i8> %a,
-                                              <vscale x 16 x i8> %b,
-                                              <vscale x 16 x i8> %c) #0 {
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @luti6_zt_i8_x4(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) #0 {
 ; CHECK-LABEL: luti6_zt_i8_x4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    luti6 { z0.b - z3.b }, zt0, { z0 - z2 }
 ; CHECK-NEXT:    ret
-  %res = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>,
-                     <vscale x 16 x i8>, <vscale x 16 x i8> }
-      @llvm.aarch64.sme.luti6.zt.x4(
-          i32 0, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b,
-          <vscale x 16 x i8> %c)
-  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
-        <vscale x 16 x i8> } %res
+  %res = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c)
+  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
-         <vscale x 8 x i16> } @luti6_i16_x4(<vscale x 8 x i16> %a,
-                                            <vscale x 8 x i16> %b,
-                                            <vscale x 16 x i8> %x,
-                                            <vscale x 16 x i8> %y) #0 {
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @luti6_i16_x4(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: luti6_i16_x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1]
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z3.h, z4.h }, { z1, z2 }[1]
 ; CHECK-NEXT:    ret
-  %res = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>,
-                     <vscale x 8 x i16>, <vscale x 8 x i16> }
-      @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
-          <vscale x 8 x i16> %a, <vscale x 8 x i16> %b,
-          <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
-        <vscale x 8 x i16> } %res
+  %res = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
-         <vscale x 8 x bfloat> } @luti6_bf16_x4(<vscale x 8 x bfloat> %a,
-                                                <vscale x 8 x bfloat> %b,
-                                                <vscale x 16 x i8> %x,
-                                                <vscale x 16 x i8> %y) #0 {
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @luti6_bf16_x4(<vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: luti6_bf16_x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[0]
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z3.h, z4.h }, { z1, z2 }[0]
 ; CHECK-NEXT:    ret
-  %res = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
-                     <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }
-      @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(
-          <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b,
-          <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 0)
-  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
-        <vscale x 8 x bfloat> } %res
+  %res = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 0)
+  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
-         <vscale x 8 x half> } @luti6_f16_x4(<vscale x 8 x half> %a,
-                                             <vscale x 8 x half> %b,
-                                             <vscale x 16 x i8> %x,
-                                             <vscale x 16 x i8> %y) #0 {
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @luti6_f16_x4(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: luti6_f16_x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1]
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z3.h, z4.h }, { z1, z2 }[1]
 ; CHECK-NEXT:    ret
-  %res = tail call { <vscale x 8 x half>, <vscale x 8 x half>,
-                     <vscale x 8 x half>, <vscale x 8 x half> }
-      @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(
-          <vscale x 8 x half> %a, <vscale x 8 x half> %b,
-          <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
-  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
-        <vscale x 8 x half> } %res
+  %res = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
+  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
 }
 
-declare <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(
-    i32, <vscale x 16 x i8>)
-declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
-          <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(
-    i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
-          <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
-    <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>,
-    <vscale x 16 x i8>, i32)
-declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
-          <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(
-    <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 16 x i8>,
-    <vscale x 16 x i8>, i32)
-declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
-          <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(
-    <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>,
-    <vscale x 16 x i8>, i32)
-
 attributes #0 = { "target-features"="+sme2p3" }
diff --git a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
index ab89e87df66d2..73cec7a570061 100644
--- a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
@@ -1,55 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve2p3 < %s | FileCheck %s
 
-target triple = "aarch64-none-linux-gnu"
-
-define <vscale x 16 x i8> @luti6_i8(<vscale x 16 x i8> %a,
+define <vscale x 16 x i8> @luti6_i8(<vscale x 16 x i8> %a) {
 ; CHECK-LABEL: luti6_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
-; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
-; CHECK-NEXT:    luti6 z0.b, { z0.b, z1.b }, z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    luti6 z0.b, { z0.b, z1.b }, z0
 ; CHECK-NEXT:    ret
-                                    <vscale x 16 x i8> %b,
-                                    <vscale x 16 x i8> %idx) {
-  %res = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(
-      <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %idx)
+  %res = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
   ret <vscale x 16 x i8> %res
 }
 
 define <vscale x 8 x i16> @luti6_i16_x2(<vscale x 8 x i16> %a,
 ; CHECK-LABEL: luti6_i16_x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
-; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
-; CHECK-NEXT:    luti6 z0.h, { z0.h, z1.h }, z2[1]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    luti6 z0.h, { z2.h, z3.h }, z1[1]
 ; CHECK-NEXT:    ret
-                                        <vscale x 8 x i16> %b,
-                                        <vscale x 16 x i8> %idx) {
-  %res = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
-      <vscale x 8 x i16> %a, <vscale x 8 x i16> %b,
-      <vscale x 16 x i8> %idx, i32 1)
+                                        <vscale x 16 x i8> %b) {
+  %res = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, i32 1)
   ret <vscale x 8 x i16> %res
 }
 
 define <vscale x 8 x half> @luti6_f16_x2(<vscale x 8 x half> %a,
 ; CHECK-LABEL: luti6_f16_x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
-; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
-; CHECK-NEXT:    luti6 z0.h, { z0.h, z1.h }, z2[0]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    luti6 z0.h, { z2.h, z3.h }, z1[0]
 ; CHECK-NEXT:    ret
-                                         <vscale x 8 x half> %b,
-                                         <vscale x 16 x i8> %idx) {
-  %res = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
-      <vscale x 8 x half> %a, <vscale x 8 x half> %b,
-      <vscale x 16 x i8> %idx, i32 0)
+                                         <vscale x 16 x i8> %b) {
+  %res = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, i32 0)
   ret <vscale x 8 x half> %res
 }
-
-declare <vscale x 16 x i8> @llvm.aarch64.sve.luti6(
-    <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
-    <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
-declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
-    <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>, i32)
diff --git a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
index 7818dd19ffb1a..9c5869e84f783 100644
--- a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
+++ b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
@@ -1,80 +1,44 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: not opt -S -passes=verify < %s 2>&1 | FileCheck %s
 
-define <vscale x 8 x i16> @bad_sve_luti6_ret(<vscale x 16 x i8> %a,
-                                             <vscale x 16 x i8> %b,
-                                             <vscale x 16 x i8> %idx) {
+define <vscale x 8 x i16> @bad_sve_luti6_ret(<vscale x 16 x i8> %a) {
 ; CHECK: Intrinsic has incorrect return type!
-  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6(
-      <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %idx)
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
   ret <vscale x 8 x i16> %res
 }
 
-define <vscale x 8 x i16> @bad_sve_luti6_lane_x2_arg(<vscale x 4 x i32> %a,
-                                                     <vscale x 8 x i16> %b,
-                                                     <vscale x 16 x i8> %idx) {
+define <vscale x 8 x i16> @bad_sve_luti6_lane_x2_arg(<vscale x 4 x i32> %a, <vscale x 16 x i8> %b) {
 ; CHECK: Intrinsic has incorrect argument type!
-  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
-      <vscale x 4 x i32> %a, <vscale x 8 x i16> %b,
-      <vscale x 16 x i8> %idx, i32 1)
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 16 x i8> %b, i32 1)
   ret <vscale x 8 x i16> %res
 }
 
-define <vscale x 8 x half> @bad_sve_luti6_lane_x2_f16_arg(
-    <vscale x 8 x i16> %a, <vscale x 8 x half> %b, <vscale x 16 x i8> %idx) {
+define <vscale x 8 x half> @bad_sve_luti6_lane_x2_f16_arg(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
 ; CHECK: Intrinsic has incorrect argument type!
-  %res = call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
-      <vscale x 8 x i16> %a, <vscale x 8 x half> %b,
-      <vscale x 16 x i8> %idx, i32 1)
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, i32 1)
   ret <vscale x 8 x half> %res
 }
 
-define <vscale x 8 x i16> @bad_sme_luti6_zt_ret(i32 %zt,
-                                                <vscale x 16 x i8> %idx) {
+define <vscale x 8 x i16> @bad_sme_luti6_zt_ret(i32 %zt, <vscale x 16 x i8> %idx) {
 ; CHECK: Intrinsic has incorrect return type!
-  %res = call <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(
-      i32 %zt, <vscale x 16 x i8> %idx)
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(i32 %zt, <vscale x 16 x i8> %idx)
   ret <vscale x 8 x i16> %res
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
-         <vscale x 8 x i16> } @bad_sme_luti6_zt_x4_ret(i32 %zt,
-                                                       <vscale x 16 x i8> %a,
-                                                       <vscale x 16 x i8> %b,
-                                                       <vscale x 16 x i8> %c) {
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @bad_sme_luti6_zt_x4_ret(i32 %zt, <vscale x 16 x i8> %a) {
 ; CHECK: Intrinsic has incorrect return type!
-  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
-                <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(
-      i32 %zt, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b,
-      <vscale x 16 x i8> %c)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
-        <vscale x 8 x i16> } %res
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(i32 %zt, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
-         <vscale x 8 x i16> } @bad_sme_luti6_lane_x4_arg(
-             <vscale x 8 x half> %a, <vscale x 8 x i16> %b,
-             <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @bad_sme_luti6_lane_x4_arg(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) {
 ; CHECK: Intrinsic has incorrect argument type!
-  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
-                <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
-      <vscale x 8 x half> %a, <vscale x 8 x i16> %b,
-      <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
-        <vscale x 8 x i16> } %res
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
 }
 
-declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6(
-    <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
-    <vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
-declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
-    <vscale x 8 x i16>, <vscale x 8 x half>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
 declare <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(i32, <vscale x 16 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
-          <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(
-    i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
-          <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
-    <vscale x 8 x half>, <vscale x 8 x i16>, <vscale x 16 x i8>,
-    <vscale x 16 x i8>, i32)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

>From 808903bdc485ee3d5804895104c967fcfdef96e1 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 9 Apr 2026 14:29:21 +0100
Subject: [PATCH 10/22] fixup! More small PR fixes

---
 clang/include/clang/Basic/arm_sve.td          |  2 +-
 clang/lib/Sema/SemaARM.cpp                    |  3 +-
 .../acle_sve2p3_target_lane.c                 |  9 +++-
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  1 +
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 15 +++----
 .../AArch64/sme2p3-intrinsics-luti6.ll        |  5 +--
 .../AArch64/sve2p3-intrinsics-luti6.ll        | 13 +++---
 .../test/Verifier/AArch64/luti6-intrinsics.ll | 44 -------------------
 8 files changed, 23 insertions(+), 69 deletions(-)
 delete mode 100644 llvm/test/Verifier/AArch64/luti6-intrinsics.ll

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index d3526d4950a06..0067c02004f68 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1923,7 +1923,7 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
   def SVLUTI6 : SInst<"svluti6[_{d}]", "d2u", "cUcm", MergeNone, "aarch64_sve_luti6", [IsOverloadNone]>;
 }
 
-let SVETargetGuard = "sve2p3", SMETargetGuard = "sme2p3" in {
+let SVETargetGuard = "sve2p3", SMETargetGuard = "sve2p3|sme2p3" in {
   def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
 }
 
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index bb99f3fd0ce94..5e7504fab416d 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Sema/SemaARM.h"
-#include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
@@ -619,7 +618,7 @@ static bool checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
     else if (SatisfiesSME)
       BuiltinType = SemaARM::ArmStreaming;
     else
-      // This should be diagnosed by CodeGen.
+      // This should be diagnosed by CodeGen
       return false;
   }
 
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
index 6a2465f4027fc..6c70379ce3da5 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -5,10 +5,17 @@
 #include <arm_sve.h>
 
 svfloat16_t missing_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
-  return svluti6_lane_f16_x2(table, indices, 1); // expected-error {{'svluti6_lane_f16_x2' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
+  return svluti6_lane_f16_x2(table, indices, 1); // expected-error {{'svluti6_lane_f16_x2' needs target feature (sve,sve2p3)|(sme,(sve2p3|sme2p3))}}
 }
 
 __attribute__((target("sve2p3")))
 svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
   return svluti6_lane_f16_x2(table, indices, 0);
 }
+
+__attribute__((target("sve2p3,sme")))
+svfloat16_t has_streaming_sve2p3_luti6_lane(svfloat16x2_t table,
+                                            svuint8_t indices)
+    __arm_streaming {
+  return svluti6_lane_f16_x2(table, indices, 1);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 8b6b9e403f16f..3f0f4d9e5028c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -4345,3 +4345,4 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_pmlal_pair_x2 : DefaultAttrsIntrinsic<[llvm_nxv2i64_ty, llvm_nxv2i64_ty],
       [llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty], [IntrNoMem]>;
 }
+
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 1821c407681a0..1e83116930ff1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2256,13 +2256,10 @@ void AArch64DAGToDAGISel::EmitMultiVectorLutiLane(SDNode *Node,
   EVT VT = Node->getValueType(0);
   bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN;
 
-  SmallVector<SDValue, 4> MachineOps(Ops);
   SmallVector<EVT, 2> ResTys = {MVT::Untyped};
-  if (HasChain)
-    MachineOps.push_back(Node->getOperand(0));
   if (HasChain)
     ResTys.push_back(MVT::Other);
-  SDNode *Instruction = CurDAG->getMachineNode(Opc, DL, ResTys, MachineOps);
+  SDNode *Instruction = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
   SDValue SuperReg(Instruction, 0);
 
   for (unsigned i = 0; i < NumOutVecs; ++i)
@@ -2287,7 +2284,8 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
   if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
     return;
 
-  SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
+  SmallVector<SDValue, 4> Ops = {ZtValue, Node->getOperand(3),
+                                 Node->getOperand(4), Node->getOperand(0)};
   EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops);
 }
 
@@ -2295,10 +2293,9 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
                                                          unsigned NumOutVecs,
                                                          unsigned Opc,
                                                          uint32_t MaxImm) {
-  SDValue ImmVal = Node->getOperand(5);
-  if (auto *Imm = dyn_cast<ConstantSDNode>(ImmVal))
-    if (Imm->getZExtValue() > MaxImm)
-      return;
+  auto *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(5));
+  if (Imm && Imm->getZExtValue() > MaxImm)
+    return;
 
   SmallVector<SDValue, 4> Ops = {
       createZTuple({Node->getOperand(1), Node->getOperand(2)}),
diff --git a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
index 95055414ef562..7e6b9a280b254 100644
--- a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
@@ -6,8 +6,7 @@ define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    luti6 z0.b, zt0, z0
 ; CHECK-NEXT:    ret
-  %res = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(
-      i32 0, <vscale x 16 x i8> %x)
+  %res = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> %x)
   ret <vscale x 16 x i8> %res
 }
 
@@ -55,5 +54,3 @@ define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale
   %res = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
   ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
 }
-
-attributes #0 = { "target-features"="+sme2p3" }
diff --git a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
index 73cec7a570061..50b406a7e91b8 100644
--- a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
@@ -1,10 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve2p3 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve2p3 -enable-subreg-liveness < %s | FileCheck %s
 
 define <vscale x 16 x i8> @luti6_i8(<vscale x 16 x i8> %a) {
 ; CHECK-LABEL: luti6_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $z0 killed $z0 def $z0_z1
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    luti6 z0.b, { z0.b, z1.b }, z0
 ; CHECK-NEXT:    ret
@@ -12,26 +11,24 @@ define <vscale x 16 x i8> @luti6_i8(<vscale x 16 x i8> %a) {
   ret <vscale x 16 x i8> %res
 }
 
-define <vscale x 8 x i16> @luti6_i16_x2(<vscale x 8 x i16> %a,
+define <vscale x 8 x i16> @luti6_i16_x2(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: luti6_i16_x2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    luti6 z0.h, { z2.h, z3.h }, z1[1]
 ; CHECK-NEXT:    ret
-                                        <vscale x 16 x i8> %b) {
   %res = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, i32 1)
   ret <vscale x 8 x i16> %res
 }
 
-define <vscale x 8 x half> @luti6_f16_x2(<vscale x 8 x half> %a,
+define <vscale x 8 x half> @luti6_f16_x2(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: luti6_f16_x2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    luti6 z0.h, { z2.h, z3.h }, z1[0]
 ; CHECK-NEXT:    ret
-                                         <vscale x 16 x i8> %b) {
   %res = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, i32 0)
   ret <vscale x 8 x half> %res
 }
diff --git a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
deleted file mode 100644
index 9c5869e84f783..0000000000000
--- a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-; RUN: not opt -S -passes=verify < %s 2>&1 | FileCheck %s
-
-define <vscale x 8 x i16> @bad_sve_luti6_ret(<vscale x 16 x i8> %a) {
-; CHECK: Intrinsic has incorrect return type!
-  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
-  ret <vscale x 8 x i16> %res
-}
-
-define <vscale x 8 x i16> @bad_sve_luti6_lane_x2_arg(<vscale x 4 x i32> %a, <vscale x 16 x i8> %b) {
-; CHECK: Intrinsic has incorrect argument type!
-  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 16 x i8> %b, i32 1)
-  ret <vscale x 8 x i16> %res
-}
-
-define <vscale x 8 x half> @bad_sve_luti6_lane_x2_f16_arg(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
-; CHECK: Intrinsic has incorrect argument type!
-  %res = call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, i32 1)
-  ret <vscale x 8 x half> %res
-}
-
-define <vscale x 8 x i16> @bad_sme_luti6_zt_ret(i32 %zt, <vscale x 16 x i8> %idx) {
-; CHECK: Intrinsic has incorrect return type!
-  %res = call <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(i32 %zt, <vscale x 16 x i8> %idx)
-  ret <vscale x 8 x i16> %res
-}
-
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @bad_sme_luti6_zt_x4_ret(i32 %zt, <vscale x 16 x i8> %a) {
-; CHECK: Intrinsic has incorrect return type!
-  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(i32 %zt, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
-}
-
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @bad_sme_luti6_lane_x4_arg(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) {
-; CHECK: Intrinsic has incorrect argument type!
-  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
-  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
-}
-
-declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 16 x i8>, i32)
-declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
-declare <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(i32, <vscale x 16 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

>From ae19a18d3cbb102aad225a1205326a2c62a9e167 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 9 Apr 2026 15:42:07 +0100
Subject: [PATCH 11/22] fixup! Fix more PR comments

---
 clang/include/clang/Basic/arm_sme.td          |  4 ++--
 clang/include/clang/Basic/arm_sve.td          |  2 +-
 .../sme2p3-intrinsics/acle_sme2p3_luti6.c     |  8 +++----
 llvm/include/llvm/IR/IntrinsicsAArch64.td     | 23 ++++++++-----------
 4 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 678fa1efc2a51..c79e6e2ae1f9a 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -982,8 +982,8 @@ let SMETargetGuard = "sme-lutv2" in {
 }
 
 let SMETargetGuard = "sme2p3" in {
-  def SVLUTI6_ZT      : SInst<"svluti6_zt_{d}", "diu", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
-  def SVLUTI6_ZT_X4   : SInst<"svluti6_zt_{d}_x4", "4i3.u", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
+  def SVLUTI6_ZT      : SInst<"svluti6_zt_{d}", "di[", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
+  def SVLUTI6_ZT_X4   : SInst<"svluti6_zt_{d}_x4", "4i3.[", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
 }
 
 let SMETargetGuard = "sme-f8f32" in {
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 0067c02004f68..14dcb822132b7 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1920,7 +1920,7 @@ let SVETargetGuard = "(sve2|sme2),lut", SMETargetGuard = "sme2,lut" in {
 }
 
 let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
-  def SVLUTI6 : SInst<"svluti6[_{d}]", "d2u", "cUcm", MergeNone, "aarch64_sve_luti6", [IsOverloadNone]>;
+  def SVLUTI6 : SInst<"svluti6[_{d}]", "d2[", "cUcm", MergeNone, "aarch64_sve_luti6", [IsOverloadNone]>;
 }
 
 let SVETargetGuard = "sve2p3", SMETargetGuard = "sve2p3|sme2p3" in {
diff --git a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
index 02dac71bb8de7..d9f3809207e6b 100644
--- a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
@@ -29,7 +29,7 @@
 //
 svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_s16,_x4,)(table, indices, 1);
+  return SVE_ACLE_FUNC(svluti6_lane,_s16_x4,,)(table, indices, 1);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4(
@@ -46,7 +46,7 @@ svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
 //
 svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_u16,_x4,)(table, indices, 0);
+  return SVE_ACLE_FUNC(svluti6_lane,_u16_x4,,)(table, indices, 0);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4(
@@ -63,7 +63,7 @@ svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
 //
 svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_f16,_x4,)(table, indices, 1);
+  return SVE_ACLE_FUNC(svluti6_lane,_f16_x4,,)(table, indices, 1);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4(
@@ -80,7 +80,7 @@ svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
 //
 svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_bf16,_x4,)(table, indices, 0);
+  return SVE_ACLE_FUNC(svluti6_lane,_bf16_x4,,)(table, indices, 0);
 }
 
 // CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_s8(
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 3f0f4d9e5028c..ca6fb98d80380 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1377,6 +1377,14 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  llvm_i32_ty],
                  !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<2>>])>;
 
+  class SVE2_LUTI_X2_Intrinsic<list<IntrinsicProperty> Attrs = []>
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>,
+                 LLVMMatchType<0>,
+                 llvm_nxv16i8_ty,
+                 llvm_i32_ty],
+                !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<3>>])>;
+
   class SVE2_1VectorArg_Long_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                 [LLVMSubdivide2VectorType<0>,
@@ -2819,19 +2827,8 @@ def int_aarch64_sve_luti6 : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
                                    llvm_nxv16i8_ty,
                                    llvm_nxv16i8_ty],
                                   [IntrNoMem, IntrSpeculatable]>;
-def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                    [LLVMMatchType<0>,
-                                    LLVMMatchType<0>,
-                                    llvm_nxv16i8_ty,
-                                    llvm_i32_ty],
-                                    [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
-def int_aarch64_sve_luti6_lane_x2
-    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                            [LLVMMatchType<0>,
-                             LLVMMatchType<0>,
-                             llvm_nxv16i8_ty,
-                             llvm_i32_ty],
-                            [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
+def int_aarch64_sve_luti4_lane_x2 : SVE2_LUTI_X2_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_luti6_lane_x2 : SVE2_LUTI_X2_Intrinsic<[IntrSpeculatable]>;
 
 //
 // SVE2 - Optional bit permutation

>From 6af8c68f042c538f8360d0dccce269e3c2eca4c8 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 9 Apr 2026 16:24:17 +0100
Subject: [PATCH 12/22] fixup! Adjust `def`s and split out tests

---
 clang/include/clang/Basic/arm_sme.td          |   1 +
 clang/include/clang/Basic/arm_sve.td          |   4 -
 .../sme2p3-intrinsics/acle_sme2p3_luti6.c     |  10 +-
 .../sve2p3-intrinsics/acle_sve2p3_luti6.c     | 158 ------------------
 .../acle_sve2p3_luti6_lane_x2.c               | 138 +++++++++++++++
 5 files changed, 144 insertions(+), 167 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6_lane_x2.c

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index c79e6e2ae1f9a..2c77510a71f6e 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -982,6 +982,7 @@ let SMETargetGuard = "sme-lutv2" in {
 }
 
 let SMETargetGuard = "sme2p3" in {
+  def SVLUTI6_X4      : SInst<"svluti6_lane_{d}_x4[_{1}_x2]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
   def SVLUTI6_ZT      : SInst<"svluti6_zt_{d}", "di[", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
   def SVLUTI6_ZT_X4   : SInst<"svluti6_zt_{d}_x4", "4i3.[", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
 }
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 14dcb822132b7..75884acf0c1ba 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1927,10 +1927,6 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = "sve2p3|sme2p3" in {
   def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
 }
 
-let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2p3" in {
-  def SVLUTI6_x4 : SInst<"svluti6_lane[_{d}_x4]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Optional
 
diff --git a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
index d9f3809207e6b..b6ef5226a9a82 100644
--- a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
@@ -10,7 +10,7 @@
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3_UNUSED,A4_UNUSED) A1
+#define SVE_ACLE_FUNC(A1,A2,A3_UNUSED,A4_UNUSED) A1##A2
 #else
 #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
 #endif
@@ -29,7 +29,7 @@
 //
 svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_s16_x4,,)(table, indices, 1);
+  return SVE_ACLE_FUNC(svluti6_lane,_s16_x4,_s16_x2,)(table, indices, 1);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4(
@@ -46,7 +46,7 @@ svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
 //
 svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_u16_x4,,)(table, indices, 0);
+  return SVE_ACLE_FUNC(svluti6_lane,_u16_x4,_u16_x2,)(table, indices, 0);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4(
@@ -63,7 +63,7 @@ svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
 //
 svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_f16_x4,,)(table, indices, 1);
+  return SVE_ACLE_FUNC(svluti6_lane,_f16_x4,_f16_x2,)(table, indices, 1);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4(
@@ -80,7 +80,7 @@ svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
 //
 svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_bf16_x4,,)(table, indices, 0);
+  return SVE_ACLE_FUNC(svluti6_lane,_bf16_x4,_bf16_x2,)(table, indices, 0);
 }
 
 // CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_s8(
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
index 59d24b641f9d5..5e3a65566ffd7 100644
--- a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
@@ -5,10 +5,6 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
-// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
-// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
-// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
 
 #include <arm_sve.h>
@@ -19,36 +15,18 @@
 #define SVE_ACLE_FUNC(A1, A2) A1##A2
 #endif
 
-#ifdef STREAMING_MODE
-#define STREAMING_ATTR __arm_streaming
-#else
-#define STREAMING_ATTR
-#endif
-
 // CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8(
 // CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8(
-// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
-//
 // CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
-//
 svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) {
   return SVE_ACLE_FUNC(svluti6, _s8)(table, indices);
 }
@@ -59,24 +37,12 @@ svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) {
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_u8(
-// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
-//
 // CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
-//
 svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) {
   return SVE_ACLE_FUNC(svluti6, _u8)(table, indices);
 }
@@ -87,136 +53,12 @@ svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) {
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_mf8(
-// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
-//
 // CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
-//
 svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) {
   return SVE_ACLE_FUNC(svluti6, _mf8)(table, indices);
 }
-
-// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
-// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
-//
-// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
-// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
-// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// STREAM-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
-//
-// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
-// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
-//
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
-// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
-//
-svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) STREAMING_ATTR {
-  return SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(table, indices, 1);
-}
-
-// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
-// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
-//
-// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
-// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// STREAM-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
-//
-// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
-// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
-//
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
-//
-svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) STREAMING_ATTR {
-  return SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(table, indices, 0);
-}
-
-// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
-// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
-// STREAM-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// STREAM-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
-// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
-//
-svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
-  return SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(table, indices, 1);
-}
-
-// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
-// CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
-//
-// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
-// STREAM-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// STREAM-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
-//
-// CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
-// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
-//
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
-//
-svbfloat16_t test_svluti6_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
-  return SVE_ACLE_FUNC(svluti6_lane, _bf16_x2)(table, indices, 0);
-}
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6_lane_x2.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6_lane_x2.c
new file mode 100644
index 0000000000000..b6d8fe5cff531
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6_lane_x2.c
@@ -0,0 +1,138 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1, A2) A1##A2
+#endif
+
+#ifdef STREAMING_MODE
+#define STREAMING_ATTR __arm_streaming
+#else
+#define STREAMING_ATTR
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) STREAMING_ATTR {
+  return SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) STREAMING_ATTR {
+  return SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(table, indices, 0);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
+// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
+  return SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svluti6_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
+  return SVE_ACLE_FUNC(svluti6_lane, _bf16_x2)(table, indices, 0);
+}

>From 9b13d4e0b8bf257b637e2af397dc158aca1c4832 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 13 Apr 2026 13:56:43 +0100
Subject: [PATCH 13/22] fixup! Add some more _bf16 tests

---
 .../acle_sve2p3_imm.cpp                       |  3 +++
 .../acle_sve2p3_target_lane.c                 | 27 +++++++++++++++++++
 .../AArch64/sve2p3-intrinsics-luti6.ll        | 11 ++++++++
 3 files changed, 41 insertions(+)

diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
index c284276f60e77..01781da390e0b 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
@@ -102,6 +102,9 @@ void test_range_0_1() {
   // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
   SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(svcreate2_f16(svundef_f16(), svundef_f16()),
                                         svundef_u8(), -1);
+  // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+  SVE_ACLE_FUNC(svluti6_lane, _bf16_x2)(svcreate2_bf16(svundef_bf16(), svundef_bf16()),
+                                         svundef_u8(), 2);
 }
 
 void test_svdot_lane_x2_imm_0_7(svint16_t s16, svuint16_t u16, svint8_t s8,
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
index 6c70379ce3da5..846e72d154c60 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -13,9 +13,36 @@ svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
   return svluti6_lane_f16_x2(table, indices, 0);
 }
 
+__attribute__((target("sve2p3,bf16")))
+svbfloat16_t has_sve2p3_luti6_lane_bf16(svbfloat16x2_t table,
+                                        svuint8_t indices) {
+  return svluti6_lane_bf16_x2(table, indices, 1);
+}
+
 __attribute__((target("sve2p3,sme")))
 svfloat16_t has_streaming_sve2p3_luti6_lane(svfloat16x2_t table,
                                             svuint8_t indices)
     __arm_streaming {
   return svluti6_lane_f16_x2(table, indices, 1);
 }
+
+__attribute__((target("sve2p3,sme,bf16")))
+svbfloat16_t has_streaming_sve2p3_luti6_lane_bf16(svbfloat16x2_t table,
+                                                  svuint8_t indices)
+    __arm_streaming {
+  return svluti6_lane_bf16_x2(table, indices, 0);
+}
+
+__attribute__((target("sme2p3,sme")))
+svfloat16_t has_streaming_sme2p3_luti6_lane(svfloat16x2_t table,
+                                            svuint8_t indices)
+    __arm_streaming {
+  return svluti6_lane_f16_x2(table, indices, 0);
+}
+
+__attribute__((target("sme2p3,sme,bf16")))
+svbfloat16_t has_streaming_sme2p3_luti6_lane_bf16(svbfloat16x2_t table,
+                                                  svuint8_t indices)
+    __arm_streaming {
+  return svluti6_lane_bf16_x2(table, indices, 1);
+}
diff --git a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
index 50b406a7e91b8..a2bf43088968f 100644
--- a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
@@ -32,3 +32,14 @@ define <vscale x 8 x half> @luti6_f16_x2(<vscale x 8 x half> %a, <vscale x 16 x
   %res = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, i32 0)
   ret <vscale x 8 x half> %res
 }
+
+define <vscale x 8 x bfloat> @luti6_bf16_x2(<vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: luti6_bf16_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    luti6 z0.h, { z2.h, z3.h }, z1[1]
+; CHECK-NEXT:    ret
+  %res = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b, i32 1)
+  ret <vscale x 8 x bfloat> %res
+}

>From 71290addc9ce3bbc0156d8fbc21563d0940d4cf1 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Tue, 14 Apr 2026 13:16:43 +0100
Subject: [PATCH 14/22] fixup! Address more PR comments

---
 clang/include/clang/Basic/arm_sme.td          |  1 -
 clang/include/clang/Basic/arm_sve.td          |  4 ++++
 .../acle_sme2p3_target.c                      |  6 ++---
 .../acle_sme2p3_target_lane.c                 | 16 -------------
 .../acle_sve2p3_target_lane.c                 | 24 ++++++++++++-------
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  5 +---
 .../AArch64/sme2p3-intrinsics-luti6.ll        | 10 ++++----
 7 files changed, 28 insertions(+), 38 deletions(-)
 delete mode 100644 clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 2c77510a71f6e..c79e6e2ae1f9a 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -982,7 +982,6 @@ let SMETargetGuard = "sme-lutv2" in {
 }
 
 let SMETargetGuard = "sme2p3" in {
-  def SVLUTI6_X4      : SInst<"svluti6_lane_{d}_x4[_{1}_x2]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
   def SVLUTI6_ZT      : SInst<"svluti6_zt_{d}", "di[", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
   def SVLUTI6_ZT_X4   : SInst<"svluti6_zt_{d}_x4", "4i3.[", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
 }
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 75884acf0c1ba..cf0502e3f255c 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1927,6 +1927,10 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = "sve2p3|sme2p3" in {
   def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
 }
 
+let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2p3" in {
+  def SVLUTI6_X4 : SInst<"svluti6_lane_{d}_x4[_{1}_x2]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Optional
 
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
index 2cffc1344bfe1..52ed761c05897 100644
--- a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
@@ -13,8 +13,8 @@ svint8_t has_sme2p3_zt(svuint8_t indices) __arm_streaming __arm_in("zt0") {
   return svluti6_zt_s8(0, indices);
 }
 
-__attribute__((target("sme2p3")))
-svfloat16_t has_sme2p3_implied_sme2p2(svbool_t pg, svfloat16_t op)
+__attribute__((target("sme2p3,bf16")))
+svbfloat16x4_t has_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return svcompact_f16(pg, op);
+  return svluti6_lane_bf16_x4(table, indices, 0);
 }
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
deleted file mode 100644
index 1a06663a9aab7..0000000000000
--- a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
+++ /dev/null
@@ -1,16 +0,0 @@
-// REQUIRES: aarch64-registered-target
-
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +bf16 -verify -emit-llvm -o - %s
-
-#include <arm_sme.h>
-
-svbfloat16x4_t missing_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
-    __arm_streaming {
-  return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature sme,sme2p3}}
-}
-
-__attribute__((target("sme2p3,bf16")))
-svbfloat16x4_t has_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
-    __arm_streaming {
-  return svluti6_lane_bf16_x4(table, indices, 0);
-}
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
index 846e72d154c60..139b240919bb8 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -14,9 +14,11 @@ svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
 }
 
 __attribute__((target("sve2p3,bf16")))
-svbfloat16_t has_sve2p3_luti6_lane_bf16(svbfloat16x2_t table,
-                                        svuint8_t indices) {
-  return svluti6_lane_bf16_x2(table, indices, 1);
+void has_sve2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
+                                svbfloat16x2_t bf16_table,
+                                svuint8_t indices) {
+  (void)svluti6_lane_f16_x2(f16_table, indices, 0);
+  (void)svluti6_lane_bf16_x2(bf16_table, indices, 1);
 }
 
 __attribute__((target("sve2p3,sme")))
@@ -27,10 +29,12 @@ svfloat16_t has_streaming_sve2p3_luti6_lane(svfloat16x2_t table,
 }
 
 __attribute__((target("sve2p3,sme,bf16")))
-svbfloat16_t has_streaming_sve2p3_luti6_lane_bf16(svbfloat16x2_t table,
-                                                  svuint8_t indices)
+void has_streaming_sve2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
+                                          svbfloat16x2_t bf16_table,
+                                          svuint8_t indices)
     __arm_streaming {
-  return svluti6_lane_bf16_x2(table, indices, 0);
+  (void)svluti6_lane_f16_x2(f16_table, indices, 1);
+  (void)svluti6_lane_bf16_x2(bf16_table, indices, 0);
 }
 
 __attribute__((target("sme2p3,sme")))
@@ -41,8 +45,10 @@ svfloat16_t has_streaming_sme2p3_luti6_lane(svfloat16x2_t table,
 }
 
 __attribute__((target("sme2p3,sme,bf16")))
-svbfloat16_t has_streaming_sme2p3_luti6_lane_bf16(svbfloat16x2_t table,
-                                                  svuint8_t indices)
+void has_streaming_sme2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
+                                          svbfloat16x2_t bf16_table,
+                                          svuint8_t indices)
     __arm_streaming {
-  return svluti6_lane_bf16_x2(table, indices, 1);
+  (void)svluti6_lane_f16_x2(f16_table, indices, 0);
+  (void)svluti6_lane_bf16_x2(bf16_table, indices, 1);
 }
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 4f49ef1b795ca..ae46201167dde 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4897,12 +4897,9 @@ let Predicates = [HasSVE2p3_or_SME2p3] in {
   defm SQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"sqshrn",   0b000, int_aarch64_sve_sqshrn_x2>;
   defm UQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"uqshrn",   0b010, int_aarch64_sve_uqshrn_x2>;
 
+  defm LUTI6_Z2ZZI        : sve2_luti6_vector_index<"luti6">;
 } // End HasSME2p3orSVE2p3
 
-let Predicates = [HasSVE2p3_or_SME2p3] in {
-  defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">;
-}
-
 //===----------------------------------------------------------------------===//
 // SVE2.3 instructions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
index 7e6b9a280b254..8cf13f7f0cd71 100644
--- a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -verify-machineinstrs -force-streaming -mtriple=aarch64-none-linux-gnu -mattr=+sme2p3 < %s | FileCheck %s
 
-define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
+define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) {
 ; CHECK-LABEL: luti6_zt_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    luti6 z0.b, zt0, z0
@@ -10,7 +10,7 @@ define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
   ret <vscale x 16 x i8> %res
 }
 
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @luti6_zt_i8_x4(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) #0 {
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @luti6_zt_i8_x4(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
 ; CHECK-LABEL: luti6_zt_i8_x4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    luti6 { z0.b - z3.b }, zt0, { z0 - z2 }
@@ -19,7 +19,7 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
   ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
 }
 
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @luti6_i16_x4(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) #0 {
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @luti6_i16_x4(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: luti6_i16_x4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z3.d, z0.d
@@ -31,7 +31,7 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
   ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
 }
 
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @luti6_bf16_x4(<vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b) #0 {
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @luti6_bf16_x4(<vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: luti6_bf16_x4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z3.d, z0.d
@@ -43,7 +43,7 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <v
   ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
 }
 
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @luti6_f16_x4(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) #0 {
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @luti6_f16_x4(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: luti6_f16_x4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z3.d, z0.d

>From e875707f66e0f111f03bdb30dcf4d6e87f4e1322 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Tue, 14 Apr 2026 21:35:32 +0100
Subject: [PATCH 15/22] fixup! Move tests

---
 .../acle_sve2p3_target.c                      | 41 ++++++++++++--
 .../acle_sve2p3_target_lane.c                 | 54 -------------------
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  1 +
 3 files changed, 39 insertions(+), 57 deletions(-)
 delete mode 100644 clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c

diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
index 3b5596ac1d5a6..15066c31af081 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
@@ -13,7 +13,42 @@ svint8_t has_sve2p3_luti6(svint8x2_t table, svuint8_t indices) {
   return svluti6_s8(table, indices);
 }
 
-__attribute__((target("sve2p3")))
-svfloat32_t has_sve2p3_implied_sve2p2(svbool_t pg, svfloat16_t op) {
-  return svcvtlt_f32_f16_z(pg, op);
+__attribute__((target("sve2p3,bf16")))
+void has_sve2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
+                                svbfloat16x2_t bf16_table,
+                                svuint8_t indices) {
+  (void)svluti6_lane_f16_x2(f16_table, indices, 0);
+  (void)svluti6_lane_bf16_x2(bf16_table, indices, 1);
+}
+
+__attribute__((target("sve2p3,sme")))
+svfloat16_t has_streaming_sve2p3_luti6_lane(svfloat16x2_t table,
+                                            svuint8_t indices)
+    __arm_streaming {
+  return svluti6_lane_f16_x2(table, indices, 1);
+}
+
+__attribute__((target("sve2p3,sme,bf16")))
+void has_streaming_sve2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
+                                          svbfloat16x2_t bf16_table,
+                                          svuint8_t indices)
+    __arm_streaming {
+  (void)svluti6_lane_f16_x2(f16_table, indices, 1);
+  (void)svluti6_lane_bf16_x2(bf16_table, indices, 0);
+}
+
+__attribute__((target("sme2p3,sme")))
+svfloat16_t has_streaming_sme2p3_luti6_lane(svfloat16x2_t table,
+                                            svuint8_t indices)
+    __arm_streaming {
+  return svluti6_lane_f16_x2(table, indices, 0);
+}
+
+__attribute__((target("sme2p3,sme,bf16")))
+void has_streaming_sme2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
+                                          svbfloat16x2_t bf16_table,
+                                          svuint8_t indices)
+    __arm_streaming {
+  (void)svluti6_lane_f16_x2(f16_table, indices, 0);
+  (void)svluti6_lane_bf16_x2(bf16_table, indices, 1);
 }
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
deleted file mode 100644
index 139b240919bb8..0000000000000
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// REQUIRES: aarch64-registered-target
-
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm -o - %s
-
-#include <arm_sve.h>
-
-svfloat16_t missing_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
-  return svluti6_lane_f16_x2(table, indices, 1); // expected-error {{'svluti6_lane_f16_x2' needs target feature (sve,sve2p3)|(sme,(sve2p3|sme2p3))}}
-}
-
-__attribute__((target("sve2p3")))
-svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
-  return svluti6_lane_f16_x2(table, indices, 0);
-}
-
-__attribute__((target("sve2p3,bf16")))
-void has_sve2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
-                                svbfloat16x2_t bf16_table,
-                                svuint8_t indices) {
-  (void)svluti6_lane_f16_x2(f16_table, indices, 0);
-  (void)svluti6_lane_bf16_x2(bf16_table, indices, 1);
-}
-
-__attribute__((target("sve2p3,sme")))
-svfloat16_t has_streaming_sve2p3_luti6_lane(svfloat16x2_t table,
-                                            svuint8_t indices)
-    __arm_streaming {
-  return svluti6_lane_f16_x2(table, indices, 1);
-}
-
-__attribute__((target("sve2p3,sme,bf16")))
-void has_streaming_sve2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
-                                          svbfloat16x2_t bf16_table,
-                                          svuint8_t indices)
-    __arm_streaming {
-  (void)svluti6_lane_f16_x2(f16_table, indices, 1);
-  (void)svluti6_lane_bf16_x2(bf16_table, indices, 0);
-}
-
-__attribute__((target("sme2p3,sme")))
-svfloat16_t has_streaming_sme2p3_luti6_lane(svfloat16x2_t table,
-                                            svuint8_t indices)
-    __arm_streaming {
-  return svluti6_lane_f16_x2(table, indices, 0);
-}
-
-__attribute__((target("sme2p3,sme,bf16")))
-void has_streaming_sme2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
-                                          svbfloat16x2_t bf16_table,
-                                          svuint8_t indices)
-    __arm_streaming {
-  (void)svluti6_lane_f16_x2(f16_table, indices, 0);
-  (void)svluti6_lane_bf16_x2(bf16_table, indices, 1);
-}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 95b4bc86a3fe6..691f95affbef3 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -313,6 +313,7 @@ def HasNonStreamingSVE2p2_or_SME2p2
                 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">,
                 AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2),
                 "sme2p2 or sve2p2">;
+
 def HasSMEF16F16_or_SMEF8F16
     : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">,
                 AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),

>From 38e73fcf0a4778157b21c0b8012aee11cb7010c2 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 16 Apr 2026 13:38:07 +0100
Subject: [PATCH 16/22] fixup! Adjust definitions after ACLE updates from
 @rockdreamer

---
 clang/include/clang/Basic/arm_sve.td          |  2 +-
 .../sve2p3-intrinsics/acle_sve2p3_luti6.c     | 24 +++++++++----------
 .../acle_sve2p3_target.c                      |  8 +++----
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index cf0502e3f255c..3bc01d80570bf 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1920,7 +1920,7 @@ let SVETargetGuard = "(sve2|sme2),lut", SMETargetGuard = "sme2,lut" in {
 }
 
 let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
-  def SVLUTI6 : SInst<"svluti6[_{d}]", "d2[", "cUcm", MergeNone, "aarch64_sve_luti6", [IsOverloadNone]>;
+  def SVLUTI6 : SInst<"svluti6[_{d}_x2]", "d2[", "cUcm", MergeNone, "aarch64_sve_luti6", [IsOverloadNone]>;
 }
 
 let SVETargetGuard = "sve2p3", SMETargetGuard = "sve2p3|sme2p3" in {
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
index 5e3a65566ffd7..11f0848af1c07 100644
--- a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
@@ -15,50 +15,50 @@
 #define SVE_ACLE_FUNC(A1, A2) A1##A2
 #endif
 
-// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8(
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8_x2(
 // CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t(
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z18test_svluti6_s8_x210svint8x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) {
-  return SVE_ACLE_FUNC(svluti6, _s8)(table, indices);
+svint8_t test_svluti6_s8_x2(svint8x2_t table, svuint8_t indices) {
+  return SVE_ACLE_FUNC(svluti6, _s8_x2)(table, indices);
 }
 
-// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_u8(
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_u8_x2(
 // CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z18test_svluti6_u8_x211svuint8x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) {
-  return SVE_ACLE_FUNC(svluti6, _u8)(table, indices);
+svuint8_t test_svluti6_u8_x2(svuint8x2_t table, svuint8_t indices) {
+  return SVE_ACLE_FUNC(svluti6, _u8_x2)(table, indices);
 }
 
-// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_mf8(
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_mf8_x2(
 // CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t(
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z19test_svluti6_mf8_x213svmfloat8x2_tu11__SVUint8_t(
 // CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) {
-  return SVE_ACLE_FUNC(svluti6, _mf8)(table, indices);
+svmfloat8_t test_svluti6_mf8_x2(svmfloat8x2_t table, svuint8_t indices) {
+  return SVE_ACLE_FUNC(svluti6, _mf8_x2)(table, indices);
 }
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
index 15066c31af081..88117cebfbb16 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
@@ -4,13 +4,13 @@
 
 #include <arm_sve.h>
 
-void missing_sve2p3_luti6(svint8x2_t table, svuint8_t indices) {
-  svluti6_s8(table, indices); // expected-error {{'svluti6_s8' needs target feature sve,sve2p3}}
+void missing_sve2p3_luti6_x2(svint8x2_t table, svuint8_t indices) {
+  svluti6_s8_x2(table, indices); // expected-error {{'svluti6_s8_x2' needs target feature sve,sve2p3}}
 }
 
 __attribute__((target("sve2p3")))
-svint8_t has_sve2p3_luti6(svint8x2_t table, svuint8_t indices) {
-  return svluti6_s8(table, indices);
+svint8_t has_sve2p3_luti6_x2(svint8x2_t table, svuint8_t indices) {
+  return svluti6_s8_x2(table, indices);
 }
 
 __attribute__((target("sve2p3,bf16")))

>From 50c38e847c43977de1c0dcc6e2d43a731eb9d85e Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 18 May 2026 16:42:26 +0100
Subject: [PATCH 17/22] fixup! Adjust after ACLE changes to svluti6_lane_s16_x4

---
 clang/include/clang/Basic/arm_sve.td          |  3 +-
 .../sme2p3-intrinsics/acle_sme2p3_luti6.c     | 52 +++++++++++++++++--
 .../acle_sme2p3_imm.c                         | 16 +++---
 .../acle_sme2p3_target.c                      |  2 +-
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  4 ++
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 21 +++++---
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  2 +
 llvm/lib/Target/AArch64/SMEInstrFormats.td    | 16 ++++--
 8 files changed, 92 insertions(+), 24 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 3bc01d80570bf..5a67c241907c1 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1928,7 +1928,8 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = "sve2p3|sme2p3" in {
 }
 
 let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2p3" in {
-  def SVLUTI6_X4 : SInst<"svluti6_lane_{d}_x4[_{1}_x2]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
+  def SVLUTI6_X4_U8X2 : SInst<"svluti6_lane_{d}_x4[_{1}_x2_u8_x2]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
+  def SVLUTI6_X4_U8X3 : SInst<"svluti6_lane_{d}_x4[_{1}_x2_u8_x3]", "423.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4_x3", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
index b6ef5226a9a82..b320ca6883d73 100644
--- a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
@@ -29,7 +29,7 @@
 //
 svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_s16_x4,_s16_x2,)(table, indices, 1);
+  return SVE_ACLE_FUNC(svluti6_lane,_s16_x4,_s16_x2_u8_x2,)(table, indices, 1);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4(
@@ -46,7 +46,7 @@ svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
 //
 svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_u16_x4,_u16_x2,)(table, indices, 0);
+  return SVE_ACLE_FUNC(svluti6_lane,_u16_x4,_u16_x2_u8_x2,)(table, indices, 0);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4(
@@ -63,7 +63,7 @@ svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
 //
 svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_f16_x4,_f16_x2,)(table, indices, 1);
+  return SVE_ACLE_FUNC(svluti6_lane,_f16_x4,_f16_x2_u8_x2,)(table, indices, 1);
 }
 
 // CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4(
@@ -80,7 +80,51 @@ svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
 //
 svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return SVE_ACLE_FUNC(svluti6_lane,_bf16_x4,_bf16_x2,)(table, indices, 0);
+  return SVE_ACLE_FUNC(svluti6_lane,_bf16_x4,_bf16_x2_u8_x2,)(table, indices, 0);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_s16_x4_u8_x3(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 1)
+// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
+svint16x4_t test_svluti6_lane_s16_x4_u8_x3(svint16x2_t table, svuint8x3_t indices)
+    __arm_streaming {
+  return SVE_ACLE_FUNC(svluti6_lane,_s16_x4,_s16_x2_u8_x3,)(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4_u8_x3(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
+svuint16x4_t test_svluti6_lane_u16_x4_u8_x3(svuint16x2_t table, svuint8x3_t indices)
+    __arm_streaming {
+  return SVE_ACLE_FUNC(svluti6_lane,_u16_x4,_u16_x2_u8_x3,)(table, indices, 0);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4_u8_x3(
+// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 1)
+// CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
+//
+svfloat16x4_t test_svluti6_lane_f16_x4_u8_x3(svfloat16x2_t table, svuint8x3_t indices)
+    __arm_streaming {
+  return SVE_ACLE_FUNC(svluti6_lane,_f16_x4,_f16_x2_u8_x3,)(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4_u8_x3(
+// CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 0)
+// CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x4_t test_svluti6_lane_bf16_x4_u8_x3(svbfloat16x2_t table, svuint8x3_t indices)
+    __arm_streaming {
+  return SVE_ACLE_FUNC(svluti6_lane,_bf16_x4,_bf16_x2_u8_x3,)(table, indices, 0);
 }
 
 // CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_s8(
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c
index 8883ea3580fb2..25c35fbcbcc7b 100644
--- a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c
@@ -10,12 +10,12 @@ void test_range_0_0(void) __arm_streaming __arm_in("zt0") {
 }
 
 void test_range_0_1(void) __arm_streaming {
-  svluti6_lane_s16_x4(svcreate2_s16(svundef_s16(), svundef_s16()), // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
-                      svcreate2_u8(svundef_u8(), svundef_u8()), -1);
-  svluti6_lane_u16_x4(svcreate2_u16(svundef_u16(), svundef_u16()), // expected-error {{argument value 2 is outside the valid range [0, 1]}}
-                      svcreate2_u8(svundef_u8(), svundef_u8()), 2);
-  svluti6_lane_f16_x4(svcreate2_f16(svundef_f16(), svundef_f16()), // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
-                      svcreate2_u8(svundef_u8(), svundef_u8()), -1);
-  svluti6_lane_bf16_x4(svcreate2_bf16(svundef_bf16(), svundef_bf16()), // expected-error {{argument value 2 is outside the valid range [0, 1]}}
-                       svcreate2_u8(svundef_u8(), svundef_u8()), 2);
+  svluti6_lane_s16_x4_s16_x2_u8_x2(svcreate2_s16(svundef_s16(), svundef_s16()), // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+                                   svcreate2_u8(svundef_u8(), svundef_u8()), -1);
+  svluti6_lane_u16_x4_u16_x2_u8_x2(svcreate2_u16(svundef_u16(), svundef_u16()), // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+                                   svcreate2_u8(svundef_u8(), svundef_u8()), 2);
+  svluti6_lane_f16_x4_f16_x2_u8_x3(svcreate2_f16(svundef_f16(), svundef_f16()), // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+                                   svcreate3_u8(svundef_u8(), svundef_u8(), svundef_u8()), -1);
+  svluti6_lane_bf16_x4_bf16_x2_u8_x3(svcreate2_bf16(svundef_bf16(), svundef_bf16()), // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+                                     svcreate3_u8(svundef_u8(), svundef_u8(), svundef_u8()), 2);
 }
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
index 52ed761c05897..02b629e766b57 100644
--- a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
@@ -16,5 +16,5 @@ svint8_t has_sme2p3_zt(svuint8_t indices) __arm_streaming __arm_in("zt0") {
 __attribute__((target("sme2p3,bf16")))
 svbfloat16x4_t has_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
     __arm_streaming {
-  return svluti6_lane_bf16_x4(table, indices, 0);
+  return svluti6_lane_bf16_x4_bf16_x2_u8_x2(table, indices, 0);
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index ca6fb98d80380..05fbe3f440b9e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -4016,6 +4016,10 @@ let TargetPrefix = "aarch64" in {
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [LLVMMatchType<0>, LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
                             [ImmArg<ArgIndex<4>>, IntrNoMem, IntrSpeculatable]>;
+  def int_aarch64_sme_luti6_lane_x4_x3
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+                            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
+                            [ImmArg<ArgIndex<5>>, IntrNoMem, IntrSpeculatable]>;
 
   def int_aarch64_sme_luti4_zt_x4
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 1e83116930ff1..d668871533c22 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -418,7 +418,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
   void EmitMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
                                ArrayRef<SDValue> Ops);
   void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
-                                      unsigned Opc, uint32_t MaxImm);
+                                      unsigned Opc, uint32_t MaxImm,
+                                      unsigned NumIndexVecs);
 
   void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
                              unsigned NumInVecs);
@@ -2292,15 +2293,18 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
 void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
                                                          unsigned NumOutVecs,
                                                          unsigned Opc,
-                                                         uint32_t MaxImm) {
-  auto *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(5));
+                                                         uint32_t MaxImm,
+                                                         unsigned NumIndexVecs) {
+  unsigned ImmOp = 3 + NumIndexVecs;
+  auto *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(ImmOp));
   if (Imm && Imm->getZExtValue() > MaxImm)
     return;
 
+  SmallVector<SDValue, 3> IndexRegs(Node->ops().slice(3, NumIndexVecs));
   SmallVector<SDValue, 4> Ops = {
       createZTuple({Node->getOperand(1), Node->getOperand(2)}),
-      createZTuple({Node->getOperand(3), Node->getOperand(4)}),
-      Node->getOperand(5),
+      createZTuple(IndexRegs),
+      Node->getOperand(ImmOp),
   };
   EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops);
 }
@@ -6119,7 +6123,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::aarch64_sme_luti6_lane_x4:
       if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
               Node->getValueType(0), {0, AArch64::LUTI6_4Z2Z2ZI, 0}))
-        SelectMultiVectorLutiLaneTuple(Node, 4, Opc, 1);
+        SelectMultiVectorLutiLaneTuple(Node, 4, Opc, 1, 2);
+      return;
+    case Intrinsic::aarch64_sme_luti6_lane_x4_x3:
+      if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
+              Node->getValueType(0), {0, AArch64::LUTI6_4Z2Z3ZI, 0}))
+        SelectMultiVectorLutiLaneTuple(Node, 4, Opc, 1, 3);
       return;
     case Intrinsic::aarch64_sve_urshl_single_x2:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index d0eb9ca218a27..75a5d7f640c8b 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -1144,5 +1144,7 @@ let Predicates = [HasSME2p3] in {
   def LUTI6_4ZT3Z     : sme2_luti6_zt_consecutive<"luti6">;
   def LUTI6_S_4ZT3Z   : sme2_luti6_zt_strided<"luti6">;
   def LUTI6_4Z2Z2ZI   : sme2_luti6_vector_vg4_consecutive<"luti6">;
+  let isCodeGenOnly = 1 in
+  def LUTI6_4Z2Z3ZI   : sme2_luti6_vector_vg4_consecutive_x3<"luti6">;
   def LUTI6_S_4Z2Z2ZI : sme2_luti6_vector_vg4_strided<"luti6">;
 } // [HasSME2p3]
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 31fff8767fbdd..78012790bd0d9 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -3967,8 +3967,9 @@ class sme2_luti6_zt_strided<string asm>
 
 //===----------------------------------------------------------------------===//
 // Lookup table read with 6-bit indices (8-bit)
-class sme2_luti6_vector_vg4_base<RegisterOperand zd_ty, string asm>
-  : I<(outs zd_ty:$Zd), (ins ZZ_h:$Zn, ZZ_Any:$Zm, VectorIndexD:$i1),
+class sme2_luti6_vector_vg4_base<RegisterOperand zd_ty,
+                                 RegisterOperand zm_ty, string asm>
+  : I<(outs zd_ty:$Zd), (ins ZZ_h:$Zn, zm_ty:$Zm, VectorIndexD:$i1),
     asm, "\t$Zd, $Zn, $Zm$i1", "", []>, Sched<[]> {
   bits<3> Zd;
   bits<5> Zn;
@@ -3982,14 +3983,21 @@ class sme2_luti6_vector_vg4_base<RegisterOperand zd_ty, string asm>
 }
 
 class sme2_luti6_vector_vg4_consecutive<string asm>
-  : sme2_luti6_vector_vg4_base<ZZZZ_h_mul_r, asm> {
+  : sme2_luti6_vector_vg4_base<ZZZZ_h_mul_r, ZZ_Any, asm> {
+  let Inst{15-10} = 0b111101;
+  let Inst{4-2}   = Zd;
+  let Inst{1-0}   = 0b00;
+}
+
+class sme2_luti6_vector_vg4_consecutive_x3<string asm>
+  : sme2_luti6_vector_vg4_base<ZZZZ_h_mul_r, ZZZ_Any, asm> {
   let Inst{15-10} = 0b111101;
   let Inst{4-2}   = Zd;
   let Inst{1-0}   = 0b00;
 }
 
 class sme2_luti6_vector_vg4_strided<string asm>
-  : sme2_luti6_vector_vg4_base<ZZZZ_h_strided, asm> {
+  : sme2_luti6_vector_vg4_base<ZZZZ_h_strided, ZZ_Any, asm> {
   let Inst{15-10} = 0b111111;
   let Inst{4}     = Zd{2};
   let Inst{3-2}   = 0b00;

>From 2eff7b06bef8a98a8b1047fe0331c503605a1b9c Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 28 May 2026 15:35:14 +0100
Subject: [PATCH 18/22] fixup! Amend after PR comments

---
 llvm/include/llvm/IR/IntrinsicsAArch64.td     | 19 +++++++++++--------
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  5 ++---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  2 +-
 llvm/lib/Target/AArch64/SVEInstrFormats.td    |  8 ++++----
 4 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 05fbe3f440b9e..1a56bab438f2d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1370,7 +1370,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  LLVMVectorOfBitcastsToInt<0>],
                 !listconcat(Attrs, [IntrNoMem])>;
 
-  class SVE2_LUTI_Inrinsic<list<IntrinsicProperty> Attrs = []>
+  class SVE2_LUTI_Intrinsic<list<IntrinsicProperty> Attrs = []>
     :  DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                 [LLVMMatchType<0>,
                  llvm_nxv16i8_ty,
@@ -1385,6 +1385,13 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                  llvm_i32_ty],
                 !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<3>>])>;
 
+  class SVE2_LUTI6_Intrinsic<list<IntrinsicProperty> Attrs = []>
+    : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
+                [llvm_nxv16i8_ty,
+                 llvm_nxv16i8_ty,
+                 llvm_nxv16i8_ty],
+                !listconcat(Attrs, [IntrNoMem])>;
+
   class SVE2_1VectorArg_Long_Intrinsic<list<IntrinsicProperty> Attrs = []>
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                 [LLVMSubdivide2VectorType<0>,
@@ -2820,13 +2827,9 @@ def int_aarch64_sve_tbx  : AdvSIMD_SVE2_TBX_Intrinsic<[IntrSpeculatable]>;
 // SVE2 - Lookup Table
 //
 
-def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>;
-def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>;
-def int_aarch64_sve_luti6 : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
-                                  [llvm_nxv16i8_ty,
-                                   llvm_nxv16i8_ty,
-                                   llvm_nxv16i8_ty],
-                                  [IntrNoMem, IntrSpeculatable]>;
+def int_aarch64_sve_luti2_lane : SVE2_LUTI_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_luti4_lane : SVE2_LUTI_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_luti6 : SVE2_LUTI6_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_luti4_lane_x2 : SVE2_LUTI_X2_Intrinsic<[IntrSpeculatable]>;
 def int_aarch64_sve_luti6_lane_x2 : SVE2_LUTI_X2_Intrinsic<[IntrSpeculatable]>;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index d668871533c22..a32166dc39dc8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2304,8 +2304,7 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
   SmallVector<SDValue, 4> Ops = {
       createZTuple({Node->getOperand(1), Node->getOperand(2)}),
       createZTuple(IndexRegs),
-      Node->getOperand(ImmOp),
-  };
+      Node->getOperand(ImmOp)};
   EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops);
 }
 
@@ -2318,7 +2317,7 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
     return;
 
   SmallVector<SDValue, 4> Regs(Node->ops().slice(3, NumInVecs));
-  SDValue ZTuple = NumInVecs == 2 ? createZMulTuple(Regs) : createZTuple(Regs);
+  SDValue ZTuple = NumInVecs == 3 ? createZTuple(Regs) : createZMulTuple(Regs);
   SDValue Ops[] = {ZtValue, ZTuple, Node->getOperand(0)};
 
   SDLoc DL(Node);
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index ae46201167dde..4548ea13a6b0d 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4897,7 +4897,7 @@ let Predicates = [HasSVE2p3_or_SME2p3] in {
   defm SQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"sqshrn",   0b000, int_aarch64_sve_sqshrn_x2>;
   defm UQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"uqshrn",   0b010, int_aarch64_sve_uqshrn_x2>;
 
-  defm LUTI6_Z2ZZI        : sve2_luti6_vector_index<"luti6">;
+  defm LUTI6_Z2ZZI        : sve2_luti6_vector_index<"luti6", int_aarch64_sve_luti6_lane_x2>;
 } // End HasSME2p3orSVE2p3
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index a16e364fb6d32..822c38569c1cd 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11415,23 +11415,23 @@ multiclass sve2_luti4_vector_vg2_index<string mnemonic> {
 }
 
 // Look up table read with 6-bit indices
-multiclass sve2_luti6_vector_index<string mnemonic> {
+multiclass sve2_luti6_vector_index<string mnemonic, SDPatternOperator intrinsic> {
   def _H : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexD32b, 0b1011, mnemonic> {
     bit idx;
     let Inst{23} = idx;
   }
 
-  def : Pat<(nxv8i16 (int_aarch64_sve_luti6_lane_x2 nxv8i16:$Op1, nxv8i16:$Op2,
+  def : Pat<(nxv8i16 (intrinsic nxv8i16:$Op1, nxv8i16:$Op2,
                       nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
             (nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
                                                                       nxv8i16:$Op2, zsub1),
                                                 nxv16i8:$Op3, timm32_0_1:$Op4))>;
-  def : Pat<(nxv8f16 (int_aarch64_sve_luti6_lane_x2 nxv8f16:$Op1, nxv8f16:$Op2,
+  def : Pat<(nxv8f16 (intrinsic nxv8f16:$Op1, nxv8f16:$Op2,
                       nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
             (nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
                                                                       nxv8f16:$Op2, zsub1),
                                                 nxv16i8:$Op3, timm32_0_1:$Op4))>;
-  def : Pat<(nxv8bf16 (int_aarch64_sve_luti6_lane_x2 nxv8bf16:$Op1, nxv8bf16:$Op2,
+  def : Pat<(nxv8bf16 (intrinsic nxv8bf16:$Op1, nxv8bf16:$Op2,
                        nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
             (nxv8bf16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0,
                                                                         nxv8bf16:$Op2, zsub1),

>From 0050b3e8bf96f34ee93530b3d75f155e1d12f575 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 28 May 2026 15:39:11 +0100
Subject: [PATCH 19/22] fixup! Don't modify SelectMultiVectorLutiLane

---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 83 +++++++++----------
 1 file changed, 38 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index a32166dc39dc8..09a9e3e5cc558 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -415,11 +415,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
 
   void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
                                  unsigned Opc, uint32_t MaxImm);
-  void EmitMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
-                               ArrayRef<SDValue> Ops);
-  void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
-                                      unsigned Opc, uint32_t MaxImm,
-                                      unsigned NumIndexVecs);
+  void SelectMultiVectorLuti6LaneX4(SDNode *Node, unsigned Opc,
+                                    unsigned NumIndexVecs);
 
   void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
                              unsigned NumInVecs);
@@ -2249,30 +2246,6 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
   SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode);
 }
 
-void AArch64DAGToDAGISel::EmitMultiVectorLutiLane(SDNode *Node,
-                                                  unsigned NumOutVecs,
-                                                  unsigned Opc,
-                                                  ArrayRef<SDValue> Ops) {
-  SDLoc DL(Node);
-  EVT VT = Node->getValueType(0);
-  bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN;
-
-  SmallVector<EVT, 2> ResTys = {MVT::Untyped};
-  if (HasChain)
-    ResTys.push_back(MVT::Other);
-  SDNode *Instruction = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
-  SDValue SuperReg(Instruction, 0);
-
-  for (unsigned i = 0; i < NumOutVecs; ++i)
-    ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg(
-                                      AArch64::zsub0 + i, DL, VT, SuperReg));
-
-  if (HasChain)
-    ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1));
-
-  CurDAG->RemoveDeadNode(Node);
-}
-
 void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
                                                     unsigned NumOutVecs,
                                                     unsigned Opc,
@@ -2285,27 +2258,47 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
   if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
     return;
 
-  SmallVector<SDValue, 4> Ops = {ZtValue, Node->getOperand(3),
-                                 Node->getOperand(4), Node->getOperand(0)};
-  EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops);
+  SDValue Chain = Node->getOperand(0);
+  SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4), Chain};
+  SDLoc DL(Node);
+  EVT VT = Node->getValueType(0);
+
+  SDNode *Instruction =
+      CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
+  SDValue SuperReg = SDValue(Instruction, 0);
+
+  for (unsigned I = 0; I < NumOutVecs; ++I)
+    ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
+                                      AArch64::zsub0 + I, DL, VT, SuperReg));
+
+  // Copy chain
+  unsigned ChainIdx = NumOutVecs;
+  ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1));
+  CurDAG->RemoveDeadNode(Node);
 }
 
-void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
-                                                         unsigned NumOutVecs,
-                                                         unsigned Opc,
-                                                         uint32_t MaxImm,
-                                                         unsigned NumIndexVecs) {
+void AArch64DAGToDAGISel::SelectMultiVectorLuti6LaneX4(SDNode *Node,
+                                                       unsigned Opc,
+                                                       unsigned NumIndexVecs) {
   unsigned ImmOp = 3 + NumIndexVecs;
   auto *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(ImmOp));
-  if (Imm && Imm->getZExtValue() > MaxImm)
+  if (Imm && Imm->getZExtValue() > 1)
     return;
 
   SmallVector<SDValue, 3> IndexRegs(Node->ops().slice(3, NumIndexVecs));
-  SmallVector<SDValue, 4> Ops = {
-      createZTuple({Node->getOperand(1), Node->getOperand(2)}),
-      createZTuple(IndexRegs),
-      Node->getOperand(ImmOp)};
-  EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops);
+  SDValue Ops[] = {createZTuple({Node->getOperand(1), Node->getOperand(2)}),
+                   createZTuple(IndexRegs), Node->getOperand(ImmOp)};
+
+  SDLoc DL(Node);
+  EVT VT = Node->getValueType(0);
+  SDNode *Instruction = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops);
+  SDValue SuperReg(Instruction, 0);
+
+  for (unsigned I = 0; I < 4; ++I)
+    ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
+                                      AArch64::zsub0 + I, DL, VT, SuperReg));
+
+  CurDAG->RemoveDeadNode(Node);
 }
 
 void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
@@ -6122,12 +6115,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::aarch64_sme_luti6_lane_x4:
       if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
               Node->getValueType(0), {0, AArch64::LUTI6_4Z2Z2ZI, 0}))
-        SelectMultiVectorLutiLaneTuple(Node, 4, Opc, 1, 2);
+        SelectMultiVectorLuti6LaneX4(Node, Opc, 2);
       return;
     case Intrinsic::aarch64_sme_luti6_lane_x4_x3:
       if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
               Node->getValueType(0), {0, AArch64::LUTI6_4Z2Z3ZI, 0}))
-        SelectMultiVectorLutiLaneTuple(Node, 4, Opc, 1, 3);
+        SelectMultiVectorLuti6LaneX4(Node, Opc, 3);
       return;
     case Intrinsic::aarch64_sve_urshl_single_x2:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(

>From 1b713ae9d112352f86c84e589a65c56043ee175c Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 28 May 2026 15:52:27 +0100
Subject: [PATCH 20/22] fixup! Run
 `clang/utils/aarch64_builtins_test_generator.py`

---
 .../arm_sme_streaming_only_sme_AND_sme2p3.c   |  56 +++++++++
 ..._sve2p3___sme_AND_LP_sve2p3_OR_sme2p3_RP.c |  77 ++++++++++++
 ...rm_sve_non_streaming_only_sve_AND_sve2p3.c |  62 +++++++++
 .../arm_sve_streaming_only_sme_AND_sme2p3.c   | 118 ++++++++++++++++++
 4 files changed, 313 insertions(+)
 create mode 100644 clang/test/Sema/AArch64/arm_sme_streaming_only_sme_AND_sme2p3.c
 create mode 100644 clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_sve2p3___sme_AND_LP_sve2p3_OR_sme2p3_RP.c
 create mode 100644 clang/test/Sema/AArch64/arm_sve_non_streaming_only_sve_AND_sve2p3.c
 create mode 100644 clang/test/Sema/AArch64/arm_sve_streaming_only_sme_AND_sme2p3.c

diff --git a/clang/test/Sema/AArch64/arm_sme_streaming_only_sme_AND_sme2p3.c b/clang/test/Sema/AArch64/arm_sme_streaming_only_sme_AND_sme2p3.c
new file mode 100644
index 0000000000000..2ab8d4d0c4120
--- /dev/null
+++ b/clang/test/Sema/AArch64/arm_sme_streaming_only_sme_AND_sme2p3.c
@@ -0,0 +1,56 @@
+// NOTE: File has been autogenerated by utils/aarch64_builtins_test_generator.py
+// RUN: %clang_cc1 %s -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +sve -verify=streaming-guard
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+// Properties: guard="" streaming_guard="sme,sme2p3" flags="streaming-only,requires-zt"
+
+void test(void) __arm_inout("zt0"){
+  svuint8_t svuint8_t_val;
+  svuint8x3_t svuint8x3_t_val;
+
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_mf8(0, svuint8_t_val);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_mf8_x4(0, svuint8x3_t_val);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_s8(0, svuint8_t_val);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_s8_x4(0, svuint8x3_t_val);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_u8(0, svuint8_t_val);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_u8_x4(0, svuint8x3_t_val);
+}
+
+void test_streaming(void) __arm_streaming __arm_inout("zt0"){
+  svuint8_t svuint8_t_val;
+  svuint8x3_t svuint8x3_t_val;
+
+  svluti6_zt_mf8(0, svuint8_t_val);
+  svluti6_zt_mf8_x4(0, svuint8x3_t_val);
+  svluti6_zt_s8(0, svuint8_t_val);
+  svluti6_zt_s8_x4(0, svuint8x3_t_val);
+  svluti6_zt_u8(0, svuint8_t_val);
+  svluti6_zt_u8_x4(0, svuint8x3_t_val);
+}
+
+void test_streaming_compatible(void) __arm_streaming_compatible __arm_inout("zt0"){
+  svuint8_t svuint8_t_val;
+  svuint8x3_t svuint8x3_t_val;
+
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_mf8(0, svuint8_t_val);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_mf8_x4(0, svuint8x3_t_val);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_s8(0, svuint8_t_val);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_s8_x4(0, svuint8x3_t_val);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_u8(0, svuint8_t_val);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_zt_u8_x4(0, svuint8x3_t_val);
+}
diff --git a/clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_sve2p3___sme_AND_LP_sve2p3_OR_sme2p3_RP.c b/clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_sve2p3___sme_AND_LP_sve2p3_OR_sme2p3_RP.c
new file mode 100644
index 0000000000000..1918990b4153e
--- /dev/null
+++ b/clang/test/Sema/AArch64/arm_sve_feature_dependent_sve_AND_sve2p3___sme_AND_LP_sve2p3_OR_sme2p3_RP.c
@@ -0,0 +1,77 @@
+// NOTE: File has been autogenerated by utils/aarch64_builtins_test_generator.py
+// RUN: %clang_cc1 %s -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2p3 -target-feature +sve -verify=streaming-guard
+// RUN: %clang_cc1 %s -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +sve2p3 -verify
+// expected-no-diagnostics
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+// Properties: guard="sve,sve2p3" streaming_guard="sme,(sve2p3|sme2p3)" flags="feature-dependent"
+
+void test(void) { // non-streaming caller; every svluti6_lane call below is annotated as rejected under the first RUN line's verify prefix
+  svbfloat16x2_t svbfloat16x2_t_val;
+  svfloat16x2_t svfloat16x2_t_val;
+  svint16x2_t svint16x2_t_val;
+  svuint8_t svuint8_t_val;
+  svuint16x2_t svuint16x2_t_val;
+
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane(svbfloat16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane(svfloat16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane(svint16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane(svuint16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_bf16_x2(svbfloat16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_f16_x2(svfloat16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_s16_x2(svint16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_u16_x2(svuint16x2_t_val, svuint8_t_val, 1);
+}
+
+void test_streaming(void) __arm_streaming{ // streaming caller; no diagnostic directives are attached to any call below
+  svbfloat16x2_t svbfloat16x2_t_val;
+  svfloat16x2_t svfloat16x2_t_val;
+  svint16x2_t svint16x2_t_val;
+  svuint8_t svuint8_t_val;
+  svuint16x2_t svuint16x2_t_val;
+
+  svluti6_lane(svbfloat16x2_t_val, svuint8_t_val, 1);
+  svluti6_lane(svfloat16x2_t_val, svuint8_t_val, 1);
+  svluti6_lane(svint16x2_t_val, svuint8_t_val, 1);
+  svluti6_lane(svuint16x2_t_val, svuint8_t_val, 1);
+  svluti6_lane_bf16_x2(svbfloat16x2_t_val, svuint8_t_val, 1);
+  svluti6_lane_f16_x2(svfloat16x2_t_val, svuint8_t_val, 1);
+  svluti6_lane_s16_x2(svint16x2_t_val, svuint8_t_val, 1);
+  svluti6_lane_u16_x2(svuint16x2_t_val, svuint8_t_val, 1);
+}
+
+void test_streaming_compatible(void) __arm_streaming_compatible{ // streaming-compatible caller; every svluti6_lane call below is annotated as rejected under the first RUN line's verify prefix
+  svbfloat16x2_t svbfloat16x2_t_val;
+  svfloat16x2_t svfloat16x2_t_val;
+  svint16x2_t svint16x2_t_val;
+  svuint8_t svuint8_t_val;
+  svuint16x2_t svuint16x2_t_val;
+
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane(svbfloat16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane(svfloat16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane(svint16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane(svuint16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_bf16_x2(svbfloat16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_f16_x2(svfloat16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_s16_x2(svint16x2_t_val, svuint8_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_u16_x2(svuint16x2_t_val, svuint8_t_val, 1);
+}
diff --git a/clang/test/Sema/AArch64/arm_sve_non_streaming_only_sve_AND_sve2p3.c b/clang/test/Sema/AArch64/arm_sve_non_streaming_only_sve_AND_sve2p3.c
new file mode 100644
index 0000000000000..ebf06311b8939
--- /dev/null
+++ b/clang/test/Sema/AArch64/arm_sve_non_streaming_only_sve_AND_sve2p3.c
@@ -0,0 +1,62 @@
+// NOTE: File has been autogenerated by utils/aarch64_builtins_test_generator.py
+// RUN: %clang_cc1 %s -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +sve2p3 -verify=guard
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+// Properties: guard="sve,sve2p3" streaming_guard="" flags=""
+
+void test(void) { // non-streaming caller; no diagnostic directives are attached to any svluti6 call below
+  svint8x2_t svint8x2_t_val;
+  svmfloat8x2_t svmfloat8x2_t_val;
+  svuint8_t svuint8_t_val;
+  svuint8x2_t svuint8x2_t_val;
+
+  svluti6(svint8x2_t_val, svuint8_t_val);
+  svluti6(svmfloat8x2_t_val, svuint8_t_val);
+  svluti6(svuint8x2_t_val, svuint8_t_val);
+  svluti6_mf8_x2(svmfloat8x2_t_val, svuint8_t_val);
+  svluti6_s8_x2(svint8x2_t_val, svuint8_t_val);
+  svluti6_u8_x2(svuint8x2_t_val, svuint8_t_val);
+}
+
+void test_streaming(void) __arm_streaming{ // streaming caller; every svluti6 call below is annotated as rejected (non-streaming-only builtins)
+  svint8x2_t svint8x2_t_val;
+  svmfloat8x2_t svmfloat8x2_t_val;
+  svuint8_t svuint8_t_val;
+  svuint8x2_t svuint8x2_t_val;
+
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6(svint8x2_t_val, svuint8_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6(svmfloat8x2_t_val, svuint8_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6(svuint8x2_t_val, svuint8_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6_mf8_x2(svmfloat8x2_t_val, svuint8_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6_s8_x2(svint8x2_t_val, svuint8_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6_u8_x2(svuint8x2_t_val, svuint8_t_val);
+}
+
+void test_streaming_compatible(void) __arm_streaming_compatible{ // streaming-compatible caller; every svluti6 call below is annotated as rejected (non-streaming-only builtins)
+  svint8x2_t svint8x2_t_val;
+  svmfloat8x2_t svmfloat8x2_t_val;
+  svuint8_t svuint8_t_val;
+  svuint8x2_t svuint8x2_t_val;
+
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6(svint8x2_t_val, svuint8_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6(svmfloat8x2_t_val, svuint8_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6(svuint8x2_t_val, svuint8_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6_mf8_x2(svmfloat8x2_t_val, svuint8_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6_s8_x2(svint8x2_t_val, svuint8_t_val);
+  // guard-error@+1 {{builtin can only be called from a non-streaming function}}
+  svluti6_u8_x2(svuint8x2_t_val, svuint8_t_val);
+}
diff --git a/clang/test/Sema/AArch64/arm_sve_streaming_only_sme_AND_sme2p3.c b/clang/test/Sema/AArch64/arm_sve_streaming_only_sme_AND_sme2p3.c
new file mode 100644
index 0000000000000..0f88ee7ad7fef
--- /dev/null
+++ b/clang/test/Sema/AArch64/arm_sve_streaming_only_sme_AND_sme2p3.c
@@ -0,0 +1,118 @@
+// NOTE: File has been autogenerated by utils/aarch64_builtins_test_generator.py
+// RUN: %clang_cc1 %s -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2p3 -target-feature +sve -verify=streaming-guard
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+// Properties: guard="" streaming_guard="sme,sme2p3" flags="streaming-only"
+
+void test(void) { // non-streaming caller; every svluti6_lane x4 call below is annotated as rejected (streaming-only builtins)
+  svbfloat16x2_t svbfloat16x2_t_val;
+  svfloat16x2_t svfloat16x2_t_val;
+  svint16x2_t svint16x2_t_val;
+  svuint8x2_t svuint8x2_t_val;
+  svuint8x3_t svuint8x3_t_val;
+  svuint16x2_t svuint16x2_t_val;
+
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_bf16_x4(svbfloat16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_bf16_x4(svbfloat16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_bf16_x4_bf16_x2_u8_x2(svbfloat16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_bf16_x4_bf16_x2_u8_x3(svbfloat16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_f16_x4(svfloat16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_f16_x4(svfloat16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_f16_x4_f16_x2_u8_x2(svfloat16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_f16_x4_f16_x2_u8_x3(svfloat16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_s16_x4(svint16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_s16_x4(svint16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_s16_x4_s16_x2_u8_x2(svint16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_s16_x4_s16_x2_u8_x3(svint16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_u16_x4(svuint16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_u16_x4(svuint16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_u16_x4_u16_x2_u8_x2(svuint16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_u16_x4_u16_x2_u8_x3(svuint16x2_t_val, svuint8x3_t_val, 1);
+}
+
+void test_streaming(void) __arm_streaming{ // streaming caller; no diagnostic directives are attached to any call below
+  svbfloat16x2_t svbfloat16x2_t_val;
+  svfloat16x2_t svfloat16x2_t_val;
+  svint16x2_t svint16x2_t_val;
+  svuint8x2_t svuint8x2_t_val;
+  svuint8x3_t svuint8x3_t_val;
+  svuint16x2_t svuint16x2_t_val;
+
+  svluti6_lane_bf16_x4(svbfloat16x2_t_val, svuint8x2_t_val, 1);
+  svluti6_lane_bf16_x4(svbfloat16x2_t_val, svuint8x3_t_val, 1);
+  svluti6_lane_bf16_x4_bf16_x2_u8_x2(svbfloat16x2_t_val, svuint8x2_t_val, 1);
+  svluti6_lane_bf16_x4_bf16_x2_u8_x3(svbfloat16x2_t_val, svuint8x3_t_val, 1);
+  svluti6_lane_f16_x4(svfloat16x2_t_val, svuint8x2_t_val, 1);
+  svluti6_lane_f16_x4(svfloat16x2_t_val, svuint8x3_t_val, 1);
+  svluti6_lane_f16_x4_f16_x2_u8_x2(svfloat16x2_t_val, svuint8x2_t_val, 1);
+  svluti6_lane_f16_x4_f16_x2_u8_x3(svfloat16x2_t_val, svuint8x3_t_val, 1);
+  svluti6_lane_s16_x4(svint16x2_t_val, svuint8x2_t_val, 1);
+  svluti6_lane_s16_x4(svint16x2_t_val, svuint8x3_t_val, 1);
+  svluti6_lane_s16_x4_s16_x2_u8_x2(svint16x2_t_val, svuint8x2_t_val, 1);
+  svluti6_lane_s16_x4_s16_x2_u8_x3(svint16x2_t_val, svuint8x3_t_val, 1);
+  svluti6_lane_u16_x4(svuint16x2_t_val, svuint8x2_t_val, 1);
+  svluti6_lane_u16_x4(svuint16x2_t_val, svuint8x3_t_val, 1);
+  svluti6_lane_u16_x4_u16_x2_u8_x2(svuint16x2_t_val, svuint8x2_t_val, 1);
+  svluti6_lane_u16_x4_u16_x2_u8_x3(svuint16x2_t_val, svuint8x3_t_val, 1);
+}
+
+void test_streaming_compatible(void) __arm_streaming_compatible{ // streaming-compatible caller; every svluti6_lane x4 call below is annotated as rejected (streaming-only builtins)
+  svbfloat16x2_t svbfloat16x2_t_val;
+  svfloat16x2_t svfloat16x2_t_val;
+  svint16x2_t svint16x2_t_val;
+  svuint8x2_t svuint8x2_t_val;
+  svuint8x3_t svuint8x3_t_val;
+  svuint16x2_t svuint16x2_t_val;
+
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_bf16_x4(svbfloat16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_bf16_x4(svbfloat16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_bf16_x4_bf16_x2_u8_x2(svbfloat16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_bf16_x4_bf16_x2_u8_x3(svbfloat16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_f16_x4(svfloat16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_f16_x4(svfloat16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_f16_x4_f16_x2_u8_x2(svfloat16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_f16_x4_f16_x2_u8_x3(svfloat16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_s16_x4(svint16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_s16_x4(svint16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_s16_x4_s16_x2_u8_x2(svint16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_s16_x4_s16_x2_u8_x3(svint16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_u16_x4(svuint16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_u16_x4(svuint16x2_t_val, svuint8x3_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_u16_x4_u16_x2_u8_x2(svuint16x2_t_val, svuint8x2_t_val, 1);
+  // streaming-guard-error@+1 {{builtin can only be called from a streaming function}}
+  svluti6_lane_u16_x4_u16_x2_u8_x3(svuint16x2_t_val, svuint8x3_t_val, 1);
+}

>From 72945f24636683d8bc99983cdac97039ec7a2dcd Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 4 Jun 2026 10:21:33 +0100
Subject: [PATCH 21/22] fixup! Reformat classes to make more sense, and other
 CR updates

---
 clang/include/clang/Basic/arm_sve.td          |  4 +-
 .../sme2p3-intrinsics/acle_sme2p3_luti6.c     | 40 +++++++++++---
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  3 +-
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 44 ++++++++-------
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  2 -
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  2 +-
 llvm/lib/Target/AArch64/SMEInstrFormats.td    | 45 ++++++++--------
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 54 +++++++++----------
 .../AArch64/sme2p3-intrinsics-luti6.ll        | 48 +++++++++++++++--
 9 files changed, 156 insertions(+), 86 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 5a67c241907c1..91111001703c3 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1928,8 +1928,8 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = "sve2p3|sme2p3" in {
 }
 
 let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2p3" in {
-  def SVLUTI6_X4_U8X2 : SInst<"svluti6_lane_{d}_x4[_{1}_x2_u8_x2]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
-  def SVLUTI6_X4_U8X3 : SInst<"svluti6_lane_{d}_x4[_{1}_x2_u8_x3]", "423.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4_x3", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
+  def SVLUTI6_X4_U8X2 : SInst<"svluti6_lane_{d}_x4[_{d}_x2_u8_x2]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4_x2", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
+  def SVLUTI6_X4_U8X3 : SInst<"svluti6_lane_{d}_x4[_{d}_x2_u8_x3]", "423.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4_x3", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
index b320ca6883d73..656b0ce565833 100644
--- a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
@@ -18,13 +18,13 @@
 // CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_s16_x4(
 // CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
 // CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z24test_svluti6_lane_s16_x411svint16x2_t11svuint8x2_t(
 // CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
 // CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
@@ -35,13 +35,13 @@ svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
 // CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4(
 // CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
 // CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z24test_svluti6_lane_u16_x412svuint16x2_t11svuint8x2_t(
 // CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
 // CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
 svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
@@ -52,13 +52,13 @@ svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
 // CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4(
 // CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
 // CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @_Z24test_svluti6_lane_f16_x413svfloat16x2_t11svuint8x2_t(
 // CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
 // CPP-CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
 //
 svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
@@ -69,13 +69,13 @@ svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
 // CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4(
 // CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
 // CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z25test_svluti6_lane_bf16_x414svbfloat16x2_t11svuint8x2_t(
 // CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CPP-CHECK-NEXT:  [[ENTRY:.*:]]
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
 // CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
 //
 svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indices)
@@ -89,6 +89,12 @@ svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indic
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 1)
 // CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z30test_svluti6_lane_s16_x4_u8_x311svint16x2_t11svuint8x3_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 1)
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
 svint16x4_t test_svluti6_lane_s16_x4_u8_x3(svint16x2_t table, svuint8x3_t indices)
     __arm_streaming {
   return SVE_ACLE_FUNC(svluti6_lane,_s16_x4,_s16_x2_u8_x3,)(table, indices, 1);
@@ -100,6 +106,12 @@ svint16x4_t test_svluti6_lane_s16_x4_u8_x3(svint16x2_t table, svuint8x3_t indice
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 0)
 // CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
 //
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z30test_svluti6_lane_u16_x4_u8_x312svuint16x2_t11svuint8x3_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
 svuint16x4_t test_svluti6_lane_u16_x4_u8_x3(svuint16x2_t table, svuint8x3_t indices)
     __arm_streaming {
   return SVE_ACLE_FUNC(svluti6_lane,_u16_x4,_u16_x2_u8_x3,)(table, indices, 0);
@@ -111,6 +123,12 @@ svuint16x4_t test_svluti6_lane_u16_x4_u8_x3(svuint16x2_t table, svuint8x3_t indi
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 1)
 // CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
 //
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @_Z30test_svluti6_lane_f16_x4_u8_x313svfloat16x2_t11svuint8x3_t(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 1)
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
+//
 svfloat16x4_t test_svluti6_lane_f16_x4_u8_x3(svfloat16x2_t table, svuint8x3_t indices)
     __arm_streaming {
   return SVE_ACLE_FUNC(svluti6_lane,_f16_x4,_f16_x2_u8_x3,)(table, indices, 1);
@@ -122,6 +140,12 @@ svfloat16x4_t test_svluti6_lane_f16_x4_u8_x3(svfloat16x2_t table, svuint8x3_t in
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 0)
 // CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
 //
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z31test_svluti6_lane_bf16_x4_u8_x314svbfloat16x2_t11svuint8x3_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]], i32 0)
+// CPP-CHECK-NEXT:    ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
 svbfloat16x4_t test_svluti6_lane_bf16_x4_u8_x3(svbfloat16x2_t table, svuint8x3_t indices)
     __arm_streaming {
   return SVE_ACLE_FUNC(svluti6_lane,_bf16_x4,_bf16_x2_u8_x3,)(table, indices, 0);
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 1a56bab438f2d..010e06fae398f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -4015,7 +4015,7 @@ let TargetPrefix = "aarch64" in {
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
                             [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>;
-  def int_aarch64_sme_luti6_lane_x4
+  def int_aarch64_sme_luti6_lane_x4_x2
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [LLVMMatchType<0>, LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
                             [ImmArg<ArgIndex<4>>, IntrNoMem, IntrSpeculatable]>;
@@ -4349,4 +4349,3 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_pmlal_pair_x2 : DefaultAttrsIntrinsic<[llvm_nxv2i64_ty, llvm_nxv2i64_ty],
       [llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty], [IntrNoMem]>;
 }
-
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 09a9e3e5cc558..3337eab30f553 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -415,8 +415,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
 
   void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
                                  unsigned Opc, uint32_t MaxImm);
-  void SelectMultiVectorLuti6LaneX4(SDNode *Node, unsigned Opc,
-                                    unsigned NumIndexVecs);
+  void SelectMultiVectorLuti6LaneX4(SDNode *Node, unsigned NumIndexVecs);
 
   void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
                              unsigned NumInVecs);
@@ -2278,21 +2277,34 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
 }
 
 void AArch64DAGToDAGISel::SelectMultiVectorLuti6LaneX4(SDNode *Node,
-                                                       unsigned Opc,
                                                        unsigned NumIndexVecs) {
-  unsigned ImmOp = 3 + NumIndexVecs;
+  assert((NumIndexVecs == 2 || NumIndexVecs == 3) &&
+         "unexpected number of index vectors");
+
+  constexpr unsigned FirstIndexOp = 3;
+  unsigned ImmOp = FirstIndexOp + NumIndexVecs;
   auto *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(ImmOp));
-  if (Imm && Imm->getZExtValue() > 1)
+  if (!Imm || Imm->getZExtValue() > 1)
     return;
 
-  SmallVector<SDValue, 3> IndexRegs(Node->ops().slice(3, NumIndexVecs));
-  SDValue Ops[] = {createZTuple({Node->getOperand(1), Node->getOperand(2)}),
-                   createZTuple(IndexRegs), Node->getOperand(ImmOp)};
+  // The luti6 instruction always takes a 2-register Zm index tuple. The x3
+  // ACLE form provides three index vectors, so the lane selects which adjacent
+  // pair to use before forming Zm (op 3/4 or op 4/5, with op 6 as imm).
+  unsigned Lane = Imm->getZExtValue();
+  unsigned IndexOp = FirstIndexOp;
+  if (NumIndexVecs == 3)
+    IndexOp += Lane;
+
+  SDValue TableTuple = createZTuple({Node->getOperand(1), Node->getOperand(2)});
+  SDValue IndexTuple =
+      createZTuple({Node->getOperand(IndexOp), Node->getOperand(IndexOp + 1)});
+  SDValue Ops[] = {TableTuple, IndexTuple, Node->getOperand(ImmOp)};
 
   SDLoc DL(Node);
   EVT VT = Node->getValueType(0);
-  SDNode *Instruction = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops);
-  SDValue SuperReg(Instruction, 0);
+  SDNode *Instruction =
+      CurDAG->getMachineNode(AArch64::LUTI6_4Z2Z2ZI, DL, MVT::Untyped, Ops);
+  SDValue SuperReg = SDValue(Instruction, 0);
 
   for (unsigned I = 0; I < 4; ++I)
     ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
@@ -2318,7 +2330,7 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
 
   SDNode *Instruction =
       CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
-  SDValue SuperReg(Instruction, 0);
+  SDValue SuperReg = SDValue(Instruction, 0);
 
   for (unsigned I = 0; I < NumOutVecs; ++I)
     ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
@@ -6112,15 +6124,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
                AArch64::SRSHL_VG4_4ZZ_S, AArch64::SRSHL_VG4_4ZZ_D}))
         SelectDestructiveMultiIntrinsic(Node, 4, false, Op);
       return;
-    case Intrinsic::aarch64_sme_luti6_lane_x4:
-      if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
-              Node->getValueType(0), {0, AArch64::LUTI6_4Z2Z2ZI, 0}))
-        SelectMultiVectorLuti6LaneX4(Node, Opc, 2);
+    case Intrinsic::aarch64_sme_luti6_lane_x4_x2:
+      SelectMultiVectorLuti6LaneX4(Node, 2);
       return;
     case Intrinsic::aarch64_sme_luti6_lane_x4_x3:
-      if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
-              Node->getValueType(0), {0, AArch64::LUTI6_4Z2Z3ZI, 0}))
-        SelectMultiVectorLuti6LaneX4(Node, Opc, 3);
+      SelectMultiVectorLuti6LaneX4(Node, 3);
       return;
     case Intrinsic::aarch64_sve_urshl_single_x2:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 75a5d7f640c8b..d0eb9ca218a27 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -1144,7 +1144,5 @@ let Predicates = [HasSME2p3] in {
   def LUTI6_4ZT3Z     : sme2_luti6_zt_consecutive<"luti6">;
   def LUTI6_S_4ZT3Z   : sme2_luti6_zt_strided<"luti6">;
   def LUTI6_4Z2Z2ZI   : sme2_luti6_vector_vg4_consecutive<"luti6">;
-  let isCodeGenOnly = 1 in
-  def LUTI6_4Z2Z3ZI   : sme2_luti6_vector_vg4_consecutive_x3<"luti6">;
   def LUTI6_S_4Z2Z2ZI : sme2_luti6_vector_vg4_strided<"luti6">;
 } // [HasSME2p3]
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 4548ea13a6b0d..4712406e37e6b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4897,7 +4897,7 @@ let Predicates = [HasSVE2p3_or_SME2p3] in {
   defm SQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"sqshrn",   0b000, int_aarch64_sve_sqshrn_x2>;
   defm UQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"uqshrn",   0b010, int_aarch64_sve_uqshrn_x2>;
 
-  defm LUTI6_Z2ZZI        : sve2_luti6_vector_index<"luti6", int_aarch64_sve_luti6_lane_x2>;
+  defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6", int_aarch64_sve_luti6_lane_x2>;
 } // End HasSME2p3orSVE2p3
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 78012790bd0d9..525e97a880e4f 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -3921,19 +3921,26 @@ multiclass sme2_luti4_vector_vg4_index<string mnemonic> {
 }
 
 // 8-bit Look up table
-multiclass sme2_lut_single<string asm, SDPatternOperator intrinsic> {
-  def NAME : I<(outs ZPR8:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn),
-                asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> {
-    bits<0> ZTt;
-    bits<5> Zd;
-    bits<5> Zn;
-    let Inst{31-10} = 0b1100000011001000010000;
-    let Inst{9-5}   = Zn;
-    let Inst{4-0}   = Zd;
-  }
+class sme2_lut_single<string asm>
+    : I<(outs ZPR8:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn),
+        asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> {
+  bits<0> ZTt;
+  bits<5> Zd;
+  bits<5> Zn;
+  let Inst{31-10} = 0b1100000011001000010000;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+}
 
+multiclass sme2_lut_single_pat<Instruction inst, SDPatternOperator intrinsic> {
   def : Pat<(nxv16i8 (intrinsic (imm_to_zt untyped:$zt), nxv16i8:$zn)),
-            (!cast<Instruction>(NAME) $zt, nxv16i8:$zn)>;
+            (inst $zt, nxv16i8:$zn)>;
+}
+
+multiclass sme2_lut_single<string asm, SDPatternOperator intrinsic> {
+  def NAME : sme2_lut_single<asm>;
+
+  defm : sme2_lut_single_pat<!cast<Instruction>(NAME), intrinsic>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -3967,9 +3974,8 @@ class sme2_luti6_zt_strided<string asm>
 
 //===----------------------------------------------------------------------===//
 // Lookup table read with 6-bit indices (8-bit)
-class sme2_luti6_vector_vg4_base<RegisterOperand zd_ty,
-                                 RegisterOperand zm_ty, string asm>
-  : I<(outs zd_ty:$Zd), (ins ZZ_h:$Zn, zm_ty:$Zm, VectorIndexD:$i1),
+class sme2_luti6_vector_vg4_base<RegisterOperand zd_ty, string asm>
+  : I<(outs zd_ty:$Zd), (ins ZZ_h:$Zn, ZZ_Any:$Zm, VectorIndexD:$i1),
     asm, "\t$Zd, $Zn, $Zm$i1", "", []>, Sched<[]> {
   bits<3> Zd;
   bits<5> Zn;
@@ -3983,21 +3989,14 @@ class sme2_luti6_vector_vg4_base<RegisterOperand zd_ty,
 }
 
 class sme2_luti6_vector_vg4_consecutive<string asm>
-  : sme2_luti6_vector_vg4_base<ZZZZ_h_mul_r, ZZ_Any, asm> {
-  let Inst{15-10} = 0b111101;
-  let Inst{4-2}   = Zd;
-  let Inst{1-0}   = 0b00;
-}
-
-class sme2_luti6_vector_vg4_consecutive_x3<string asm>
-  : sme2_luti6_vector_vg4_base<ZZZZ_h_mul_r, ZZZ_Any, asm> {
+  : sme2_luti6_vector_vg4_base<ZZZZ_h_mul_r, asm> {
   let Inst{15-10} = 0b111101;
   let Inst{4-2}   = Zd;
   let Inst{1-0}   = 0b00;
 }
 
 class sme2_luti6_vector_vg4_strided<string asm>
-  : sme2_luti6_vector_vg4_base<ZZZZ_h_strided, ZZ_Any, asm> {
+  : sme2_luti6_vector_vg4_base<ZZZZ_h_strided, asm> {
   let Inst{15-10} = 0b111111;
   let Inst{4}     = Zd{2};
   let Inst{3-2}   = 0b00;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 822c38569c1cd..7fd2df5d7da7a 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11414,6 +11414,13 @@ multiclass sve2_luti4_vector_vg2_index<string mnemonic> {
                                                 nxv16i8:$Op3, timm32_0_3:$Op4))>;
 }
 
+class sve2_luti6_vector_index_pat<ValueType vt, SDPatternOperator intrinsic,
+                                  Instruction inst>
+    : Pat<(vt (intrinsic vt:$Op1, vt:$Op2, nxv16i8:$Op3,
+                         (i32 timm32_0_1:$Op4))),
+          (vt (inst (REG_SEQUENCE ZPR2, vt:$Op1, zsub0, vt:$Op2, zsub1),
+                    nxv16i8:$Op3, timm32_0_1:$Op4))>;
+
 // Look up table read with 6-bit indices
 multiclass sve2_luti6_vector_index<string mnemonic, SDPatternOperator intrinsic> {
   def _H : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexD32b, 0b1011, mnemonic> {
@@ -11421,37 +11428,30 @@ multiclass sve2_luti6_vector_index<string mnemonic, SDPatternOperator intrinsic>
     let Inst{23} = idx;
   }
 
-  def : Pat<(nxv8i16 (intrinsic nxv8i16:$Op1, nxv8i16:$Op2,
-                      nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
-            (nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
-                                                                      nxv8i16:$Op2, zsub1),
-                                                nxv16i8:$Op3, timm32_0_1:$Op4))>;
-  def : Pat<(nxv8f16 (intrinsic nxv8f16:$Op1, nxv8f16:$Op2,
-                      nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
-            (nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
-                                                                      nxv8f16:$Op2, zsub1),
-                                                nxv16i8:$Op3, timm32_0_1:$Op4))>;
-  def : Pat<(nxv8bf16 (intrinsic nxv8bf16:$Op1, nxv8bf16:$Op2,
-                       nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
-            (nxv8bf16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0,
-                                                                        nxv8bf16:$Op2, zsub1),
-                                                 nxv16i8:$Op3, timm32_0_1:$Op4))>;
+  def : sve2_luti6_vector_index_pat<nxv8i16, intrinsic,
+                                    !cast<Instruction>(NAME # _H)>;
+  def : sve2_luti6_vector_index_pat<nxv8f16, intrinsic,
+                                    !cast<Instruction>(NAME # _H)>;
+  def : sve2_luti6_vector_index_pat<nxv8bf16, intrinsic,
+                                    !cast<Instruction>(NAME # _H)>;
 }
 
 // Look up table
+class sve2_luti6_vector<string mnemonic>
+    : I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, ZPRAny:$Zm),
+        mnemonic, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  bits<5> Zm;
+  let Inst{31-21} = 0b01000101001;
+  let Inst{20-16} = Zm;
+  let Inst{15-10} = 0b101011;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+}
+
 multiclass sve2_luti6_vector<string mnemonic, SDPatternOperator intrinsic> {
-  def NAME : I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, ZPRAny:$Zm),
-                mnemonic, "\t$Zd, $Zn, $Zm",
-                "", []>, Sched<[]> {
-    bits<5> Zd;
-    bits<5> Zn;
-    bits<5> Zm;
-    let Inst{31-21} = 0b01000101001;
-    let Inst{20-16} = Zm;
-    let Inst{15-10} = 0b101011;
-    let Inst{9-5}   = Zn;
-    let Inst{4-0}   = Zd;
-  }
+  def NAME : sve2_luti6_vector<mnemonic>;
 
   def : Pat<(nxv16i8 (intrinsic nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)),
             (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0,
diff --git a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
index 8cf13f7f0cd71..3c695e46267d3 100644
--- a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
@@ -27,7 +27,7 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
 ; CHECK-NEXT:    mov z4.d, z0.d
 ; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z3.h, z4.h }, { z1, z2 }[1]
 ; CHECK-NEXT:    ret
-  %res = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
+  %res = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x2.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
   ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
 }
 
@@ -39,7 +39,7 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <v
 ; CHECK-NEXT:    mov z4.d, z0.d
 ; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z3.h, z4.h }, { z1, z2 }[0]
 ; CHECK-NEXT:    ret
-  %res = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 0)
+  %res = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.x2.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 0)
   ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
 }
 
@@ -51,6 +51,48 @@ define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale
 ; CHECK-NEXT:    mov z4.d, z0.d
 ; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z3.h, z4.h }, { z1, z2 }[1]
 ; CHECK-NEXT:    ret
-  %res = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
+  %res = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.x2.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
   ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
 }
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @luti6_i16_x4_x3_imm0(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d) {
+; CHECK-LABEL: luti6_i16_x4_x3_imm0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z3.h, z4.h }, { z1, z2 }[0]
+; CHECK-NEXT:    ret
+  %res = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, i32 0)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @luti6_i16_x4_x3_imm1(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d) {
+; CHECK-LABEL: luti6_i16_x4_x3_imm1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1]
+; CHECK-NEXT:    ret
+  %res = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, i32 1)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @luti6_f16_x4_x3(<vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d) {
+; CHECK-LABEL: luti6_f16_x4_x3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1]
+; CHECK-NEXT:    ret
+  %res = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, i32 1)
+  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @luti6_bf16_x4_x3(<vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d) {
+; CHECK-LABEL: luti6_bf16_x4_x3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    luti6 { z0.h - z3.h }, { z3.h, z4.h }, { z1, z2 }[0]
+; CHECK-NEXT:    ret
+  %res = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.x3.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, i32 0)
+  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}

>From b68444e381aaec4271f38dcb82cceb408098e6e6 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Wed, 10 Jun 2026 11:44:46 +0100
Subject: [PATCH 22/22] fixup! Tighten code some more

---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  3 +++
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  8 ++----
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 26 +++++++++----------
 3 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3337eab30f553..0a4a0ffed53bf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2317,6 +2317,9 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
                                                 unsigned NumOutVecs,
                                                 unsigned Opc,
                                                 unsigned NumInVecs) {
+  assert((NumInVecs == 2 || NumInVecs == 3) &&
+         "unexpected number of input vectors");
+
   SDValue ZtValue;
   if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
     return;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 525e97a880e4f..f07fb8ad81f63 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -3932,15 +3932,11 @@ class sme2_lut_single<string asm>
   let Inst{4-0}   = Zd;
 }
 
-multiclass sme2_lut_single_pat<Instruction inst, SDPatternOperator intrinsic> {
-  def : Pat<(nxv16i8 (intrinsic (imm_to_zt untyped:$zt), nxv16i8:$zn)),
-            (inst $zt, nxv16i8:$zn)>;
-}
-
 multiclass sme2_lut_single<string asm, SDPatternOperator intrinsic> {
   def NAME : sme2_lut_single<asm>;
 
-  defm : sme2_lut_single_pat<!cast<Instruction>(NAME), intrinsic>;
+  def : Pat<(nxv16i8 (intrinsic (imm_to_zt untyped:$zt), nxv16i8:$zn)),
+            (!cast<Instruction>(NAME) $zt, nxv16i8:$zn)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 7fd2df5d7da7a..f96702a01c277 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -728,6 +728,13 @@ class SVE_Cvt_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt
     : Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2)),
                   (!cast<Instruction>(name) (REG_SEQUENCE ZPR2Mul2, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1))>;
 
+class SVE_LUTI6_VG2_Index_Pat<ValueType vt, SDPatternOperator intrinsic,
+                              Instruction inst>
+    : Pat<(vt (intrinsic vt:$Op1, vt:$Op2, nxv16i8:$Op3,
+                         (i32 timm32_0_1:$Op4))),
+          (vt (inst (REG_SEQUENCE ZPR2, vt:$Op1, zsub0, vt:$Op2, zsub1),
+                    nxv16i8:$Op3, timm32_0_1:$Op4))>;
+
 //===----------------------------------------------------------------------===//
 // SVE pattern match helpers.
 //===----------------------------------------------------------------------===//
@@ -11414,13 +11421,6 @@ multiclass sve2_luti4_vector_vg2_index<string mnemonic> {
                                                 nxv16i8:$Op3, timm32_0_3:$Op4))>;
 }
 
-class sve2_luti6_vector_index_pat<ValueType vt, SDPatternOperator intrinsic,
-                                  Instruction inst>
-    : Pat<(vt (intrinsic vt:$Op1, vt:$Op2, nxv16i8:$Op3,
-                         (i32 timm32_0_1:$Op4))),
-          (vt (inst (REG_SEQUENCE ZPR2, vt:$Op1, zsub0, vt:$Op2, zsub1),
-                    nxv16i8:$Op3, timm32_0_1:$Op4))>;
-
 // Look up table read with 6-bit indices
 multiclass sve2_luti6_vector_index<string mnemonic, SDPatternOperator intrinsic> {
   def _H : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexD32b, 0b1011, mnemonic> {
@@ -11428,12 +11428,12 @@ multiclass sve2_luti6_vector_index<string mnemonic, SDPatternOperator intrinsic>
     let Inst{23} = idx;
   }
 
-  def : sve2_luti6_vector_index_pat<nxv8i16, intrinsic,
-                                    !cast<Instruction>(NAME # _H)>;
-  def : sve2_luti6_vector_index_pat<nxv8f16, intrinsic,
-                                    !cast<Instruction>(NAME # _H)>;
-  def : sve2_luti6_vector_index_pat<nxv8bf16, intrinsic,
-                                    !cast<Instruction>(NAME # _H)>;
+  def : SVE_LUTI6_VG2_Index_Pat<nxv8i16, intrinsic,
+                                !cast<Instruction>(NAME # _H)>;
+  def : SVE_LUTI6_VG2_Index_Pat<nxv8f16, intrinsic,
+                                !cast<Instruction>(NAME # _H)>;
+  def : SVE_LUTI6_VG2_Index_Pat<nxv8bf16, intrinsic,
+                                !cast<Instruction>(NAME # _H)>;
 }
 
 // Look up table