[clang] [llvm] [AArch64][clang][llvm] Add support for Armv9.7-A lookup table intrinsics (PR #187046)
Jonathan Thackray via cfe-commits
cfe-commits at lists.llvm.org
Tue Apr 14 05:34:39 PDT 2026
https://github.com/jthackray updated https://github.com/llvm/llvm-project/pull/187046
>From 5eaea6ac9cc04a4895648714c87964246ab98f78 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Fri, 13 Mar 2026 15:35:37 +0000
Subject: [PATCH 01/14] [AArch64][clang][llvm] Add support for Armv9.7-A lookup
table intrinsics
Add support for the following Armv9.7-A Lookup Table (lut)
instruction intrinsics:
SVE2.3
```c
// Variants are also available for: _u8 and _mf8.
svint8_t svluti6[_s8](svint8x2_t table, svuint8_t indices);
```
SVE2.3 and SME2.3
```c
// Variants are also available for _u16_x2 and _f16_x2.
svint16_t svluti6_lane[_s16_x2](svint16x2_t table, svuint8_t indices, uint64_t imm_idx);
```
SME2.3
```c
// Variants are also available for: _u16, _f16 and _bf16.
svint16x4_t svluti6_lane[_s16_x4](svint16x2_t table, svuint8x2_t indices, uint64_t imm_idx);
// Variants are also available for: _u8 and _mf8.
svint8x4_t svluti6_zt_s8_x4(uint64_t zt0, svuint8x3_t zn) __arm_streaming __arm_in("zt0");
// Variants are also available for: _u8 and _mf8.
svint8_t svluti6_zt_s8(uint64_t zt0, svuint8_t zn) __arm_streaming __arm_in("zt0");
```
---
clang/include/clang/Basic/arm_sme.td | 6 +
clang/include/clang/Basic/arm_sve.td | 9 +
clang/lib/Basic/Targets/AArch64.cpp | 29 +++
clang/lib/Basic/Targets/AArch64.h | 2 +
.../sme2p3-intrinsics/acle_sme2p3_luti6.c | 175 ++++++++++++++++++
.../sve2p3-intrinsics/acle_sve2p3_luti6.c | 112 +++++++++++
.../Preprocessor/aarch64-target-features.c | 23 +++
.../acle_sme2p3_imm.c | 21 +++
.../acle_sme2p3_target.c | 20 ++
.../acle_sme2p3_target_lane.c | 16 ++
.../acle_sve2p3_imm.cpp | 24 +++
.../acle_sve2p3_target.c | 19 ++
.../acle_sve2p3_target_lane.c | 14 ++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 32 ++++
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 102 ++++++++++
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 10 +-
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 3 +
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 12 +-
llvm/lib/Target/AArch64/SVEInstrFormats.td | 11 ++
.../AArch64/sme2p3-intrinsics-luti6.ll | 105 +++++++++++
.../AArch64/sve2p3-intrinsics-luti6.ll | 55 ++++++
.../test/Verifier/AArch64/luti6-intrinsics.ll | 79 ++++++++
22 files changed, 876 insertions(+), 3 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
create mode 100644 clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
create mode 100644 clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c
create mode 100644 clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
create mode 100644 clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
create mode 100644 clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
create mode 100644 clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
create mode 100644 clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
create mode 100644 llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
create mode 100644 llvm/test/Verifier/AArch64/luti6-intrinsics.ll
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 032c588966032..8de360fca5f5e 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -981,6 +981,12 @@ let SMETargetGuard = "sme-lutv2" in {
def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
}
+let SMETargetGuard = "sme2p3" in {
+ def SVLUTI6_ZT : SInst<"svluti6_zt_{d}", "diu", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
+ def SVLUTI6_ZT_X4 : SInst<"svluti6_zt_{d}_x4", "4i3.u", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
+ def SVLUTI6_LANE_X4 : SInst<"svluti6_lane[_{d}_x4]", "42.d2.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
+}
+
let SMETargetGuard = "sme-f8f32" in {
def SVMOPA_FP8_ZA32 : Inst<"svmopa_za32[_mf8]_m", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za32",
[IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>]>;
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 724802cce24f7..555d334638763 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1876,6 +1876,15 @@ let SVETargetGuard = "(sve2|sme2),lut", SMETargetGuard = "sme2,lut" in {
def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>;
}
+let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
+ def SVLUTI6 : SInst<"svluti6[_{d}]", "d2u", "cUcm", MergeNone, "aarch64_sve_luti6", [IsOverloadNone]>;
+}
+
+let SVETargetGuard = "sve2p3", SMETargetGuard = "sme2p3" in {
+ def SVLUTI6_x2_I16 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUs", MergeNone, "aarch64_sve_luti6_lane_x2_i16", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+ def SVLUTI6_x2_F16 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "h", MergeNone, "aarch64_sve_luti6_lane_x2_f16", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+}
+
////////////////////////////////////////////////////////////////////////////////
// SVE2 - Optional
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 9b951e69cce33..a79075622ce1d 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -500,6 +500,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
if (HasSVE2p1)
Builder.defineMacro("__ARM_FEATURE_SVE2p1", "1");
+ if (HasSVE2p3)
+ Builder.defineMacro("__ARM_FEATURE_SVE2p3", "1");
+
if (HasSVE2 && HasSVEAES)
Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1");
@@ -526,6 +529,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
if (HasSME2p1)
Builder.defineMacro("__ARM_FEATURE_SME2p1", "1");
+ if (HasSME2p3)
+ Builder.defineMacro("__ARM_FEATURE_SME2p3", "1");
+
if (HasSMEF16F16)
Builder.defineMacro("__ARM_FEATURE_SME_F16F16", "1");
@@ -907,9 +913,11 @@ void AArch64TargetInfo::computeFeatureLookup() {
.Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3)
.Case("sve2-sm4", FPU & SveMode && HasSVE2SM4)
.Case("sve2p1", FPU & SveMode && HasSVE2p1)
+ .Case("sve2p3", FPU & SveMode && HasSVE2p3)
.Case("sme", HasSME)
.Case("sme2", HasSME2)
.Case("sme2p1", HasSME2p1)
+ .Case("sme2p3", HasSME2p3)
.Case("sme-f64f64", HasSMEF64F64)
.Case("sme-i16i64", HasSMEI16I64)
.Case("sme-fa64", HasSMEFA64)
@@ -1015,6 +1023,15 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasSVE2 = true;
HasSVE2p1 = true;
}
+ if (Feature == "+sve2p3") {
+ FPU |= NeonMode;
+ FPU |= SveMode;
+ HasFullFP16 = true;
+ HasSVE2 = true;
+ HasSVE2p1 = true;
+ HasSVE2p2 = true;
+ HasSVE2p3 = true;
+ }
if (Feature == "+sve-aes") {
FPU |= NeonMode;
HasFullFP16 = true;
@@ -1071,6 +1088,18 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasBFloat16 = true;
HasFullFP16 = true;
}
+ if (Feature == "+sme2p3") {
+ HasSME = true;
+ HasSME2 = true;
+ HasSVE2 = true;
+ HasSVE2p1 = true;
+ HasSVE2p2 = true;
+ HasSME2p1 = true;
+ HasSME2p2 = true;
+ HasSME2p3 = true;
+ HasBFloat16 = true;
+ HasFullFP16 = true;
+ }
if (Feature == "+sme-f64f64") {
HasSME = true;
HasSMEF64F64 = true;
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index 0a29bad81939b..b3c722a7f6d74 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -86,6 +86,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
bool HasBFloat16 = false;
bool HasSVE2 = false;
bool HasSVE2p1 = false;
+ bool HasSVE2p3 = false;
bool HasSVEAES = false;
bool HasSVE2SHA3 = false;
bool HasSVE2SM4 = false;
@@ -111,6 +112,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
bool HasSMEF16F16 = false;
bool HasSMEB16B16 = false;
bool HasSME2p1 = false;
+ bool HasSME2p3 = false;
bool HasFP8 = false;
bool HasFP8FMA = false;
bool HasFP8DOT2 = false;
diff --git a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
new file mode 100644
index 0000000000000..ae5fb1f64d0fc
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
@@ -0,0 +1,175 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_s16_x4(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z24test_svluti6_lane_s16_x411svint16x2_t11svuint8x2_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
+svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
+ __arm_streaming {
+ return svluti6_lane_s16_x4(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z24test_svluti6_lane_u16_x412svuint16x2_t11svuint8x2_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
+//
+svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
+ __arm_streaming {
+ return svluti6_lane_u16_x4(table, indices, 0);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4(
+// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @_Z24test_svluti6_lane_f16_x413svfloat16x2_t11svuint8x2_t(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1)
+// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
+//
+svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
+ __arm_streaming {
+ return svluti6_lane_f16_x4(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4(
+// CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @_Z25test_svluti6_lane_bf16_x414svbfloat16x2_t11svuint8x2_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0)
+// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indices)
+ __arm_streaming {
+ return svluti6_lane_bf16_x4(table, indices, 0);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_s8(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z18test_svluti6_zt_s8u11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svluti6_zt_s8(svuint8_t indices) __arm_streaming __arm_in("zt0") {
+ return svluti6_zt_s8(0, indices);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_u8(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z18test_svluti6_zt_u8u11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svluti6_zt_u8(svuint8_t indices) __arm_streaming __arm_in("zt0") {
+ return svluti6_zt_u8(0, indices);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z19test_svluti6_zt_mf8u11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svluti6_zt_mf8(svuint8_t indices) __arm_streaming __arm_in("zt0") {
+ return svluti6_zt_mf8(0, indices);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svluti6_zt_u8_x4(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z21test_svluti6_zt_u8_x411svuint8x3_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svuint8x4_t test_svluti6_zt_u8_x4(svuint8x3_t indices)
+ __arm_streaming __arm_in("zt0") {
+ return svluti6_zt_u8_x4(0, indices);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svluti6_zt_s8_x4(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z21test_svluti6_zt_s8_x411svuint8x3_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svint8x4_t test_svluti6_zt_s8_x4(svuint8x3_t indices)
+ __arm_streaming __arm_in("zt0") {
+ return svluti6_zt_s8_x4(0, indices);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svluti6_zt_mf8_x4(
+// CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z22test_svluti6_zt_mf8_x411svuint8x3_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]])
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
+//
+svmfloat8x4_t test_svluti6_zt_mf8_x4(svuint8x3_t indices)
+ __arm_streaming __arm_in("zt0") {
+ return svluti6_zt_mf8_x4(0, indices);
+}
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
new file mode 100644
index 0000000000000..a806ef0b13c20
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
@@ -0,0 +1,112 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1, A2) A1##A2
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8(
+// CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) {
+ return SVE_ACLE_FUNC(svluti6, _s8)(table, indices);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_u8(
+// CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) {
+ return SVE_ACLE_FUNC(svluti6, _u8)(table, indices);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) {
+ return SVE_ACLE_FUNC(svluti6, _mf8)(table, indices);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) {
+ return SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) {
+ return SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(table, indices, 0);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
+// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) {
+ return SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(table, indices, 1);
+}
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 60ddaad639d48..6316b25befed8 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -827,9 +827,32 @@
// CHECK-SVE2p2: __ARM_NEON_FP 0xE
// CHECK-SVE2p2: __ARM_NEON_SVE_BRIDGE 1
//
+// RUN: %clang -target aarch64-none-linux-gnu -march=armv9.7-a+sve2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2p3 %s
+// CHECK-SVE2p3: __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
+// CHECK-SVE2p3: __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
+// CHECK-SVE2p3: __ARM_FEATURE_SVE 1
+// CHECK-SVE2p3: __ARM_FEATURE_SVE2 1
+// CHECK-SVE2p3: __ARM_FEATURE_SVE2p1 1
+// CHECK-SVE2p3: __ARM_FEATURE_SVE2p2 1
+// CHECK-SVE2p3: __ARM_FEATURE_SVE2p3 1
+// CHECK-SVE2p3: __ARM_NEON 1
+// CHECK-SVE2p3: __ARM_NEON_FP 0xE
+// CHECK-SVE2p3: __ARM_NEON_SVE_BRIDGE 1
+// CHECK-SVE2p3-NOT: __ARM_FEATURE_SME2p3 1
+//
// RUN: %clang --target=aarch64 -march=armv9-a+sme2p2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p2 %s
// CHECK-SME2p2: __ARM_FEATURE_LOCALLY_STREAMING 1
// CHECK-SME2p2: __ARM_FEATURE_SME 1
// CHECK-SME2p2: __ARM_FEATURE_SME2 1
// CHECK-SME2p2: __ARM_FEATURE_SME2p1 1
// CHECK-SME2p2: __ARM_FEATURE_SME2p2 1
+//
+// RUN: %clang --target=aarch64 -march=armv9.7-a+sme2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p3 %s
+// CHECK-SME2p3: __ARM_FEATURE_LOCALLY_STREAMING 1
+// CHECK-SME2p3: __ARM_FEATURE_SME 1
+// CHECK-SME2p3: __ARM_FEATURE_SME2 1
+// CHECK-SME2p3: __ARM_FEATURE_SME2p1 1
+// CHECK-SME2p3: __ARM_FEATURE_SME2p2 1
+// CHECK-SME2p3: __ARM_FEATURE_SME2p3 1
+// CHECK-SME2p3: __ARM_FEATURE_SVE2p1 1
+// CHECK-SME2p3: __ARM_FEATURE_SVE2p2 1
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c
new file mode 100644
index 0000000000000..8883ea3580fb2
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c
@@ -0,0 +1,21 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -fsyntax-only -verify %s
+
+#include <arm_sme.h>
+
+void test_range_0_0(void) __arm_streaming __arm_in("zt0") {
+ svluti6_zt_s8(1, svundef_u8()); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+ svluti6_zt_u8_x4(1, svcreate3_u8(svundef_u8(), svundef_u8(), svundef_u8())); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
+}
+
+void test_range_0_1(void) __arm_streaming {
+ svluti6_lane_s16_x4(svcreate2_s16(svundef_s16(), svundef_s16()), // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+ svcreate2_u8(svundef_u8(), svundef_u8()), -1);
+ svluti6_lane_u16_x4(svcreate2_u16(svundef_u16(), svundef_u16()), // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+ svcreate2_u8(svundef_u8(), svundef_u8()), 2);
+ svluti6_lane_f16_x4(svcreate2_f16(svundef_f16(), svundef_f16()), // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+ svcreate2_u8(svundef_u8(), svundef_u8()), -1);
+ svluti6_lane_bf16_x4(svcreate2_bf16(svundef_bf16(), svundef_bf16()), // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+ svcreate2_u8(svundef_u8(), svundef_u8()), 2);
+}
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
new file mode 100644
index 0000000000000..2cffc1344bfe1
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
@@ -0,0 +1,20 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +bf16 -verify -emit-llvm -o - %s
+
+#include <arm_sme.h>
+
+svint8_t missing_sme2p3_zt(svuint8_t indices) __arm_streaming __arm_in("zt0") {
+ return svluti6_zt_s8(0, indices); // expected-error {{'svluti6_zt_s8' needs target feature sme,sme2p3}}
+}
+
+__attribute__((target("sme2p3")))
+svint8_t has_sme2p3_zt(svuint8_t indices) __arm_streaming __arm_in("zt0") {
+ return svluti6_zt_s8(0, indices);
+}
+
+__attribute__((target("sme2p3")))
+svfloat16_t has_sme2p3_implied_sme2p2(svbool_t pg, svfloat16_t op)
+ __arm_streaming {
+ return svcompact_f16(pg, op);
+}
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
new file mode 100644
index 0000000000000..1a06663a9aab7
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
@@ -0,0 +1,16 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +bf16 -verify -emit-llvm -o - %s
+
+#include <arm_sme.h>
+
+svbfloat16x4_t missing_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
+ __arm_streaming {
+ return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature sme,sme2p3}}
+}
+
+__attribute__((target("sme2p3,bf16")))
+svbfloat16x4_t has_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
+ __arm_streaming {
+ return svluti6_lane_bf16_x4(table, indices, 0);
+}
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
new file mode 100644
index 0000000000000..8bbb0211b0bbb
--- /dev/null
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
@@ -0,0 +1,24 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -fsyntax-only -verify %s
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1, A2) A1##A2
+#endif
+
+#include <arm_sve.h>
+
+void test_range_0_1() {
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+ SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(svcreate2_s16(svundef_s16(), svundef_s16()),
+ svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+ SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(svcreate2_u16(svundef_u16(), svundef_u16()),
+ svundef_u8(), 2);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+ SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(svcreate2_f16(svundef_f16(), svundef_f16()),
+ svundef_u8(), -1);
+}
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
new file mode 100644
index 0000000000000..3b5596ac1d5a6
--- /dev/null
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c
@@ -0,0 +1,19 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm -o - %s
+
+#include <arm_sve.h>
+
+void missing_sve2p3_luti6(svint8x2_t table, svuint8_t indices) {
+ svluti6_s8(table, indices); // expected-error {{'svluti6_s8' needs target feature sve,sve2p3}}
+}
+
+__attribute__((target("sve2p3")))
+svint8_t has_sve2p3_luti6(svint8x2_t table, svuint8_t indices) {
+ return svluti6_s8(table, indices);
+}
+
+__attribute__((target("sve2p3")))
+svfloat32_t has_sve2p3_implied_sve2p2(svbool_t pg, svfloat16_t op) {
+ return svcvtlt_f32_f16_z(pg, op);
+}
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
new file mode 100644
index 0000000000000..6a2465f4027fc
--- /dev/null
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -0,0 +1,14 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm -o - %s
+
+#include <arm_sve.h>
+
+svfloat16_t missing_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
+ return svluti6_lane_f16_x2(table, indices, 1); // expected-error {{'svluti6_lane_f16_x2' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
+}
+
+__attribute__((target("sve2p3")))
+svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
+ return svluti6_lane_f16_x2(table, indices, 0);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index e2734d0d54016..9228555db06d2 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1054,6 +1054,7 @@ def llvm_nxv4i1_ty : LLVMType<nxv4i1>;
def llvm_nxv8i1_ty : LLVMType<nxv8i1>;
def llvm_nxv16i1_ty : LLVMType<nxv16i1>;
def llvm_nxv16i8_ty : LLVMType<nxv16i8>;
+def llvm_nxv8i16_ty : LLVMType<nxv8i16>;
def llvm_nxv4i32_ty : LLVMType<nxv4i32>;
def llvm_nxv2i64_ty : LLVMType<nxv2i64>;
def llvm_nxv8f16_ty : LLVMType<nxv8f16>;
@@ -2800,12 +2801,31 @@ def int_aarch64_sve_tbx : AdvSIMD_SVE2_TBX_Intrinsic<[IntrSpeculatable]>;
def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>;
def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_luti6 : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
+ [llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty],
+ [IntrNoMem, IntrSpeculatable]>;
def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
LLVMMatchType<0>,
llvm_nxv16i8_ty,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
+def int_aarch64_sve_luti6_lane_x2_i16
+ : DefaultAttrsIntrinsic<[llvm_nxv8i16_ty],
+ [llvm_nxv8i16_ty,
+ llvm_nxv8i16_ty,
+ llvm_nxv16i8_ty,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
+def int_aarch64_sve_luti6_lane_x2_f16
+ : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty],
+ [llvm_nxv8f16_ty,
+ llvm_nxv8f16_ty,
+ llvm_nxv16i8_ty,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
//
// SVE2 - Optional bit permutation
@@ -3960,6 +3980,9 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_luti4_lane_zt
: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>;
+ def int_aarch64_sme_luti6_zt
+ : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_i32_ty, llvm_nxv16i8_ty],
+ [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrReadMem]>;
// Lookup table expand two registers
//
@@ -3981,11 +4004,20 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>;
+ def int_aarch64_sme_luti6_lane_x4
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
+ [ImmArg<ArgIndex<4>>, IntrNoMem, IntrSpeculatable]>;
def int_aarch64_sme_luti4_zt_x4
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrReadMem]>;
+ def int_aarch64_sme_luti6_zt_x4
+ : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty, llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+ [llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+ [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrReadMem]>;
//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 09ff2209a67eb..ce793f304791e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -414,8 +414,12 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm);
+ void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
+ unsigned Opc, uint32_t MaxImm);
void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc);
+ void SelectMultiVectorLutiZT(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
+ unsigned NumInVecs);
template <unsigned MaxIdx, unsigned Scale>
bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
@@ -2255,6 +2259,51 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
CurDAG->RemoveDeadNode(Node);
}
+void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
+ unsigned NumOutVecs,
+ unsigned Opc,
+ uint32_t MaxImm) { // Selects Opc for a LUTI lane intrinsic taking two vector pairs plus an immediate.
+ const bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN; // Chained form: operand 0 is the chain.
+ const unsigned BaseOp = HasChain ? 1 : 0; // Operand just before the first vector (presumably the intrinsic ID).
+ const unsigned t0 = BaseOp + 1; // First vector of the first pair.
+ const unsigned t1 = BaseOp + 2; // Second vector of the first pair.
+ const unsigned i0 = BaseOp + 3; // First vector of the second pair.
+ const unsigned i1 = BaseOp + 4; // Second vector of the second pair.
+ const unsigned ImmOp = BaseOp + 5; // Trailing immediate operand.
+ // Only a constant immediate is range-checked; an out-of-range value leaves Node unselected.
+ SDValue ImmVal = Node->getOperand(ImmOp);
+ if (auto *Imm = dyn_cast<ConstantSDNode>(ImmVal))
+ if (Imm->getZExtValue() > MaxImm)
+ return;
+ // Machine operands: tuple(t0, t1), tuple(i0, i1), then the immediate.
+ SDLoc DL(Node);
+ EVT VT = Node->getValueType(0);
+ SmallVector<SDValue, 4> Ops = {
+ createZTuple({Node->getOperand(t0), Node->getOperand(t1)}),
+ createZTuple({Node->getOperand(i0), Node->getOperand(i1)}),
+ Node->getOperand(ImmOp),
+ };
+
+ SDNode *Instruction;
+ if (HasChain) {
+ Ops.push_back(Node->getOperand(0)); // The chain goes last in the machine node's operand list.
+ Instruction =
+ CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
+ } else {
+ Instruction = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops);
+ }
+ SDValue SuperReg(Instruction, 0); // Result 0 is the untyped value that holds every output vector.
+ // Rewire output i of Node to sub-register zsub0 + i of that untyped result.
+ for (unsigned i = 0; i < NumOutVecs; ++i)
+ ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg(
+ AArch64::zsub0 + i, DL, VT, SuperReg));
+ // In the chained form, Node's result NumOutVecs is replaced by the machine node's MVT::Other result.
+ if (HasChain)
+ ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1));
+
+ CurDAG->RemoveDeadNode(Node);
+}
+
void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc) {
@@ -2284,6 +2333,50 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
CurDAG->RemoveDeadNode(Node);
}
+void AArch64DAGToDAGISel::SelectMultiVectorLutiZT(SDNode *Node,
+ unsigned NumOutVecs,
+ unsigned Opc,
+ unsigned NumInVecs) { // Selects Opc for a ZT-based LUTI intrinsic taking NumInVecs (2 or 3) input vectors.
+ const unsigned ChainOp = 0; // Always treated as chained: operand 0 is read as the chain.
+ const unsigned ZtOp = 2; // ZT selector operand (operand 1 is presumably the intrinsic ID).
+ const unsigned FirstVecOp = 3; // Input vectors occupy operands 3 .. 3 + NumInVecs - 1.
+ // Leave Node unselected if ImmToReg cannot translate the ZT operand.
+ SDValue ZtValue;
+ if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(ZtOp), ZtValue))
+ return;
+ // Bundle the inputs into one tuple; the 2-wide case uses createZMulTuple, the 3-wide case createZTuple.
+ SDValue ZTuple;
+ switch (NumInVecs) {
+ case 2:
+ ZTuple = createZMulTuple(
+ {Node->getOperand(FirstVecOp), Node->getOperand(FirstVecOp + 1)});
+ break;
+ case 3:
+ ZTuple = createZTuple({Node->getOperand(FirstVecOp),
+ Node->getOperand(FirstVecOp + 1),
+ Node->getOperand(FirstVecOp + 2)});
+ break;
+ default:
+ llvm_unreachable("unexpected LUTI ZT tuple width");
+ }
+ // Machine operands: ZT register, input tuple, then the chain.
+ SDValue Ops[] = {ZtValue, ZTuple, Node->getOperand(ChainOp)};
+
+ SDLoc DL(Node);
+ EVT VT = Node->getValueType(0);
+ // Result 0 is the untyped value that holds every output vector; result 1 has type MVT::Other.
+ SDNode *Instruction =
+ CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
+ SDValue SuperReg(Instruction, 0);
+ // Rewire output i of Node to sub-register zsub0 + i of that untyped result.
+ for (unsigned i = 0; i < NumOutVecs; ++i)
+ ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg(
+ AArch64::zsub0 + i, DL, VT, SuperReg));
+ // Node's result NumOutVecs is replaced by the machine node's MVT::Other result.
+ ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1));
+ CurDAG->RemoveDeadNode(Node);
+}
+
void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs,
unsigned Op) {
SDLoc DL(N);
@@ -5916,6 +6009,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z);
return;
}
+ case Intrinsic::aarch64_sme_luti6_zt_x4: {
+ SelectMultiVectorLutiZT(Node, 4, AArch64::LUTI6_4ZT3Z, 3);
+ return;
+ }
case Intrinsic::aarch64_sve_fp8_cvtl1_x2:
if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::FP>(
Node->getValueType(0),
@@ -6006,6 +6103,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::SRSHL_VG4_4ZZ_S, AArch64::SRSHL_VG4_4ZZ_D}))
SelectDestructiveMultiIntrinsic(Node, 4, false, Op);
return;
+ case Intrinsic::aarch64_sme_luti6_lane_x4:
+ if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
+ Node->getValueType(0), {0, AArch64::LUTI6_4Z2Z2ZI, 0}))
+ SelectMultiVectorLutiLaneTuple(Node, 4, Opc, 1);
+ return;
case Intrinsic::aarch64_sve_urshl_single_x2:
if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
Node->getValueType(0),
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 129e49f645633..346fe850576e8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -229,7 +229,7 @@ def HasF16MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasF
AssemblerPredicateWithAll<(all_of FeatureF16MM), "f16mm">;
def HasSVE2p3 : Predicate<"Subtarget->hasSVE2p3()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">;
-def HasSME2p3 : Predicate<"Subtarget->hasSME2p3()">,
+def HasSME2p3 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p3()">,
AssemblerPredicateWithAll<(all_of FeatureSME2p3), "sme2p3">;
def HasF16F32DOT : Predicate<"Subtarget->hasF16F32DOT()">,
AssemblerPredicateWithAll<(all_of FeatureF16F32DOT), "f16f32dot">;
@@ -313,6 +313,14 @@ def HasNonStreamingSVE2p2_or_SME2p2
"(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">,
AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2),
"sme2p2 or sve2p2">;
+def HasNonStreamingSVE2p3
+ : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()">,
+ AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">;
+def HasNonStreamingSVE2p3_or_SME2p3
+ : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()) ||"
+ "(Subtarget->isStreaming() && Subtarget->hasSME2p3())">,
+ AssemblerPredicateWithAll<(any_of FeatureSVE2p3, FeatureSME2p3),
+ "sme2p3 or sve2p3">;
def HasSMEF16F16_or_SMEF8F16
: Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 022fed6473486..9afeae7c25de1 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -1141,6 +1141,9 @@ let Predicates = [HasSME_MOP4, HasSMEF64F64] in {
//===----------------------------------------------------------------------===//
let Predicates = [HasSME2p3] in {
def LUTI6_ZTZ : sme2_lut_single<"luti6">;
+ def : Pat<(nxv16i8 (int_aarch64_sme_luti6_zt (imm_to_zt untyped:$zt),
+ nxv16i8:$zn)),
+ (LUTI6_ZTZ $zt, nxv16i8:$zn)>;
def LUTI6_4ZT3Z : sme2_luti6_zt_consecutive<"luti6">;
def LUTI6_S_4ZT3Z : sme2_luti6_zt_strided<"luti6">;
def LUTI6_4Z2Z2ZI : sme2_luti6_vector_vg4_consecutive<"luti6">;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 436f50fe451be..dc3854f9887fb 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4841,14 +4841,22 @@ let Predicates = [HasSVE2p3_or_SME2p3] in {
defm SQSHRN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"sqshrn", 0b000, null_frag>;
defm UQSHRN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"uqshrn", 0b010, null_frag>;
- defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">;
} // End HasSME2p3orSVE2p3
+let Predicates = [HasNonStreamingSVE2p3_or_SME2p3] in {
+ defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">;
+}
+
//===----------------------------------------------------------------------===//
// SVE2.3 instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasSVE2p3] in {
+let Predicates = [HasNonStreamingSVE2p3] in {
def LUTI6_Z2ZZ : sve2_luti6_vector<"luti6">;
+ def : Pat<(nxv16i8 (int_aarch64_sve_luti6 nxv16i8:$Op1, nxv16i8:$Op2,
+ nxv16i8:$Op3)),
+ (LUTI6_Z2ZZ (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0,
+ nxv16i8:$Op2, zsub1),
+ nxv16i8:$Op3)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 72434fa86e04c..7c09937020d2f 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11384,6 +11384,17 @@ multiclass sve2_luti6_vector_index<string mnemonic> {
bit idx;
let Inst{23} = idx;
}
+
+ def : Pat<(nxv8i16 (int_aarch64_sve_luti6_lane_x2_i16 nxv8i16:$Op1, nxv8i16:$Op2,
+ nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
+ (nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
+ nxv8i16:$Op2, zsub1),
+ nxv16i8:$Op3, timm32_0_1:$Op4))>;
+ def : Pat<(nxv8f16 (int_aarch64_sve_luti6_lane_x2_f16 nxv8f16:$Op1, nxv8f16:$Op2,
+ nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
+ (nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
+ nxv8f16:$Op2, zsub1),
+ nxv16i8:$Op3, timm32_0_1:$Op4))>;
}
// Look up table
diff --git a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
new file mode 100644
index 0000000000000..07fb62baa58cd
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -force-streaming -mtriple=aarch64-none-linux-gnu -mattr=+sme2p3 < %s | FileCheck %s
+
+target triple = "aarch64-none-linux-gnu"
+
+define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
+; CHECK-LABEL: luti6_zt_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti6 z0.b, zt0, z0
+; CHECK-NEXT: ret
+ %res = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(
+ i32 0, <vscale x 16 x i8> %x)
+ ret <vscale x 16 x i8> %res
+}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
+ <vscale x 16 x i8> } @luti6_zt_i8_x4(<vscale x 16 x i8> %a,
+ <vscale x 16 x i8> %b,
+ <vscale x 16 x i8> %c) #0 {
+; CHECK-LABEL: luti6_zt_i8_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z0 - z2 }
+; CHECK-NEXT: ret
+ %res = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>,
+ <vscale x 16 x i8>, <vscale x 16 x i8> }
+ @llvm.aarch64.sme.luti6.zt.x4(
+ i32 0, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b,
+ <vscale x 16 x i8> %c)
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
+ <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16> } @luti6_i16_x4(<vscale x 8 x i16> %a,
+ <vscale x 8 x i16> %b,
+ <vscale x 16 x i8> %x,
+ <vscale x 16 x i8> %y) #0 {
+; CHECK-LABEL: luti6_i16_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1]
+; CHECK-NEXT: ret
+ %res = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16>, <vscale x 8 x i16> }
+ @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
+ <vscale x 8 x i16> %a, <vscale x 8 x i16> %b,
+ <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat> } @luti6_bf16_x4(<vscale x 8 x bfloat> %a,
+ <vscale x 8 x bfloat> %b,
+ <vscale x 16 x i8> %x,
+ <vscale x 16 x i8> %y) #0 {
+; CHECK-LABEL: luti6_bf16_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[0]
+; CHECK-NEXT: ret
+ %res = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }
+ @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(
+ <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b,
+ <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 0)
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+ <vscale x 8 x half> } @luti6_f16_x4(<vscale x 8 x half> %a,
+ <vscale x 8 x half> %b,
+ <vscale x 16 x i8> %x,
+ <vscale x 16 x i8> %y) #0 {
+; CHECK-LABEL: luti6_f16_x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1]
+; CHECK-NEXT: ret
+ %res = tail call { <vscale x 8 x half>, <vscale x 8 x half>,
+ <vscale x 8 x half>, <vscale x 8 x half> }
+ @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(
+ <vscale x 8 x half> %a, <vscale x 8 x half> %b,
+ <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
+ ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+ <vscale x 8 x half> } %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(
+ i32, <vscale x 16 x i8>)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
+ <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(
+ i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
+ <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>,
+ <vscale x 16 x i8>, i32)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(
+ <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 16 x i8>,
+ <vscale x 16 x i8>, i32)
+declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+ <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(
+ <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>,
+ <vscale x 16 x i8>, i32)
+
+attributes #0 = { "target-features"="+sme2p3" }
diff --git a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
new file mode 100644
index 0000000000000..ab89e87df66d2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve2p3 < %s | FileCheck %s
+
+target triple = "aarch64-none-linux-gnu"
+
+define <vscale x 16 x i8> @luti6_i8(<vscale x 16 x i8> %a,
+; CHECK-LABEL: luti6_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: luti6 z0.b, { z0.b, z1.b }, z2
+; CHECK-NEXT: ret
+ <vscale x 16 x i8> %b,
+ <vscale x 16 x i8> %idx) {
+ %res = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(
+ <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %idx)
+ ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @luti6_i16_x2(<vscale x 8 x i16> %a,
+; CHECK-LABEL: luti6_i16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: luti6 z0.h, { z0.h, z1.h }, z2[1]
+; CHECK-NEXT: ret
+ <vscale x 8 x i16> %b,
+ <vscale x 16 x i8> %idx) {
+ %res = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
+ <vscale x 8 x i16> %a, <vscale x 8 x i16> %b,
+ <vscale x 16 x i8> %idx, i32 1)
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @luti6_f16_x2(<vscale x 8 x half> %a,
+; CHECK-LABEL: luti6_f16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: luti6 z0.h, { z0.h, z1.h }, z2[0]
+; CHECK-NEXT: ret
+ <vscale x 8 x half> %b,
+ <vscale x 16 x i8> %idx) {
+ %res = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
+ <vscale x 8 x half> %a, <vscale x 8 x half> %b,
+ <vscale x 16 x i8> %idx, i32 0)
+ ret <vscale x 8 x half> %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.luti6(
+ <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
+ <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
+ <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>, i32)
diff --git a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
new file mode 100644
index 0000000000000..0777c1db532b1
--- /dev/null
+++ b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
@@ -0,0 +1,79 @@
+; RUN: not opt -S -passes=verify < %s 2>&1 | FileCheck %s
+
+define <vscale x 8 x i16> @bad_sve_luti6_ret(<vscale x 16 x i8> %a,
+ <vscale x 16 x i8> %b,
+ <vscale x 16 x i8> %idx) {
+; CHECK: Intrinsic has incorrect return type!
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6(
+ <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %idx)
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i16> @bad_sve_luti6_lane_x2_arg(<vscale x 4 x i32> %a,
+ <vscale x 8 x i16> %b,
+ <vscale x 16 x i8> %idx) {
+; CHECK: Intrinsic has incorrect argument type!
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
+ <vscale x 4 x i32> %a, <vscale x 8 x i16> %b,
+ <vscale x 16 x i8> %idx, i32 1)
+ ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x half> @bad_sve_luti6_lane_x2_f16_arg(
+ <vscale x 8 x i16> %a, <vscale x 8 x half> %b, <vscale x 16 x i8> %idx) {
+; CHECK: Intrinsic has incorrect argument type!
+ %res = call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
+ <vscale x 8 x i16> %a, <vscale x 8 x half> %b,
+ <vscale x 16 x i8> %idx, i32 1)
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 8 x i16> @bad_sme_luti6_zt_ret(i32 %zt,
+ <vscale x 16 x i8> %idx) {
+; CHECK: Intrinsic has incorrect return type!
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(
+ i32 %zt, <vscale x 16 x i8> %idx)
+ ret <vscale x 8 x i16> %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16> } @bad_sme_luti6_zt_x4_ret(i32 %zt,
+ <vscale x 16 x i8> %a,
+ <vscale x 16 x i8> %b,
+ <vscale x 16 x i8> %c) {
+; CHECK: Intrinsic has incorrect return type!
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(
+ i32 %zt, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b,
+ <vscale x 16 x i8> %c)
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16> } @bad_sme_luti6_lane_x4_arg(
+ <vscale x 8 x half> %a, <vscale x 8 x i16> %b,
+ <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; CHECK: Intrinsic has incorrect argument type!
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
+ <vscale x 8 x half> %a, <vscale x 8 x i16> %b,
+ <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16> } %res
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6(
+ <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
+ <vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
+ <vscale x 8 x i16>, <vscale x 8 x half>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(i32, <vscale x 16 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(
+ i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
+ <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
+ <vscale x 8 x half>, <vscale x 8 x i16>, <vscale x 16 x i8>,
+ <vscale x 16 x i8>, i32)
>From 7e039cc5705e8a3a5e0ed5bcda63f76b0c231953 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 19 Mar 2026 16:39:11 +0000
Subject: [PATCH 02/14] fixup! Address PR comments
---
clang/lib/Basic/Targets/AArch64.cpp | 29 ------------------
clang/lib/Basic/Targets/AArch64.h | 2 --
.../Preprocessor/aarch64-target-features.c | 23 --------------
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 5 +---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 7 +----
llvm/lib/Target/AArch64/SMEInstrFormats.td | 22 ++++++++------
llvm/lib/Target/AArch64/SVEInstrFormats.td | 30 +++++++++++--------
7 files changed, 33 insertions(+), 85 deletions(-)
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index a79075622ce1d..9b951e69cce33 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -500,9 +500,6 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
if (HasSVE2p1)
Builder.defineMacro("__ARM_FEATURE_SVE2p1", "1");
- if (HasSVE2p3)
- Builder.defineMacro("__ARM_FEATURE_SVE2p3", "1");
-
if (HasSVE2 && HasSVEAES)
Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1");
@@ -529,9 +526,6 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
if (HasSME2p1)
Builder.defineMacro("__ARM_FEATURE_SME2p1", "1");
- if (HasSME2p3)
- Builder.defineMacro("__ARM_FEATURE_SME2p3", "1");
-
if (HasSMEF16F16)
Builder.defineMacro("__ARM_FEATURE_SME_F16F16", "1");
@@ -913,11 +907,9 @@ void AArch64TargetInfo::computeFeatureLookup() {
.Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3)
.Case("sve2-sm4", FPU & SveMode && HasSVE2SM4)
.Case("sve2p1", FPU & SveMode && HasSVE2p1)
- .Case("sve2p3", FPU & SveMode && HasSVE2p3)
.Case("sme", HasSME)
.Case("sme2", HasSME2)
.Case("sme2p1", HasSME2p1)
- .Case("sme2p3", HasSME2p3)
.Case("sme-f64f64", HasSMEF64F64)
.Case("sme-i16i64", HasSMEI16I64)
.Case("sme-fa64", HasSMEFA64)
@@ -1023,15 +1015,6 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasSVE2 = true;
HasSVE2p1 = true;
}
- if (Feature == "+sve2p3") {
- FPU |= NeonMode;
- FPU |= SveMode;
- HasFullFP16 = true;
- HasSVE2 = true;
- HasSVE2p1 = true;
- HasSVE2p2 = true;
- HasSVE2p3 = true;
- }
if (Feature == "+sve-aes") {
FPU |= NeonMode;
HasFullFP16 = true;
@@ -1088,18 +1071,6 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasBFloat16 = true;
HasFullFP16 = true;
}
- if (Feature == "+sme2p3") {
- HasSME = true;
- HasSME2 = true;
- HasSVE2 = true;
- HasSVE2p1 = true;
- HasSVE2p2 = true;
- HasSME2p1 = true;
- HasSME2p2 = true;
- HasSME2p3 = true;
- HasBFloat16 = true;
- HasFullFP16 = true;
- }
if (Feature == "+sme-f64f64") {
HasSME = true;
HasSMEF64F64 = true;
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index b3c722a7f6d74..0a29bad81939b 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -86,7 +86,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
bool HasBFloat16 = false;
bool HasSVE2 = false;
bool HasSVE2p1 = false;
- bool HasSVE2p3 = false;
bool HasSVEAES = false;
bool HasSVE2SHA3 = false;
bool HasSVE2SM4 = false;
@@ -112,7 +111,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
bool HasSMEF16F16 = false;
bool HasSMEB16B16 = false;
bool HasSME2p1 = false;
- bool HasSME2p3 = false;
bool HasFP8 = false;
bool HasFP8FMA = false;
bool HasFP8DOT2 = false;
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 6316b25befed8..60ddaad639d48 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -827,32 +827,9 @@
// CHECK-SVE2p2: __ARM_NEON_FP 0xE
// CHECK-SVE2p2: __ARM_NEON_SVE_BRIDGE 1
//
-// RUN: %clang -target aarch64-none-linux-gnu -march=armv9.7-a+sve2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2p3 %s
-// CHECK-SVE2p3: __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
-// CHECK-SVE2p3: __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
-// CHECK-SVE2p3: __ARM_FEATURE_SVE 1
-// CHECK-SVE2p3: __ARM_FEATURE_SVE2 1
-// CHECK-SVE2p3: __ARM_FEATURE_SVE2p1 1
-// CHECK-SVE2p3: __ARM_FEATURE_SVE2p2 1
-// CHECK-SVE2p3: __ARM_FEATURE_SVE2p3 1
-// CHECK-SVE2p3: __ARM_NEON 1
-// CHECK-SVE2p3: __ARM_NEON_FP 0xE
-// CHECK-SVE2p3: __ARM_NEON_SVE_BRIDGE 1
-// CHECK-SVE2p3-NOT: __ARM_FEATURE_SME2p3 1
-//
// RUN: %clang --target=aarch64 -march=armv9-a+sme2p2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p2 %s
// CHECK-SME2p2: __ARM_FEATURE_LOCALLY_STREAMING 1
// CHECK-SME2p2: __ARM_FEATURE_SME 1
// CHECK-SME2p2: __ARM_FEATURE_SME2 1
// CHECK-SME2p2: __ARM_FEATURE_SME2p1 1
// CHECK-SME2p2: __ARM_FEATURE_SME2p2 1
-//
-// RUN: %clang --target=aarch64 -march=armv9.7-a+sme2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p3 %s
-// CHECK-SME2p3: __ARM_FEATURE_LOCALLY_STREAMING 1
-// CHECK-SME2p3: __ARM_FEATURE_SME 1
-// CHECK-SME2p3: __ARM_FEATURE_SME2 1
-// CHECK-SME2p3: __ARM_FEATURE_SME2p1 1
-// CHECK-SME2p3: __ARM_FEATURE_SME2p2 1
-// CHECK-SME2p3: __ARM_FEATURE_SME2p3 1
-// CHECK-SME2p3: __ARM_FEATURE_SVE2p1 1
-// CHECK-SME2p3: __ARM_FEATURE_SVE2p2 1
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 9afeae7c25de1..d0eb9ca218a27 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -1140,10 +1140,7 @@ let Predicates = [HasSME_MOP4, HasSMEF64F64] in {
// SME2.3 instructions
//===----------------------------------------------------------------------===//
let Predicates = [HasSME2p3] in {
- def LUTI6_ZTZ : sme2_lut_single<"luti6">;
- def : Pat<(nxv16i8 (int_aarch64_sme_luti6_zt (imm_to_zt untyped:$zt),
- nxv16i8:$zn)),
- (LUTI6_ZTZ $zt, nxv16i8:$zn)>;
+ defm LUTI6_ZTZ : sme2_lut_single<"luti6", int_aarch64_sme_luti6_zt>;
def LUTI6_4ZT3Z : sme2_luti6_zt_consecutive<"luti6">;
def LUTI6_S_4ZT3Z : sme2_luti6_zt_strided<"luti6">;
def LUTI6_4Z2Z2ZI : sme2_luti6_vector_vg4_consecutive<"luti6">;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index dc3854f9887fb..00734da5422e1 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4851,12 +4851,7 @@ let Predicates = [HasNonStreamingSVE2p3_or_SME2p3] in {
// SVE2.3 instructions
//===----------------------------------------------------------------------===//
let Predicates = [HasNonStreamingSVE2p3] in {
- def LUTI6_Z2ZZ : sve2_luti6_vector<"luti6">;
- def : Pat<(nxv16i8 (int_aarch64_sve_luti6 nxv16i8:$Op1, nxv16i8:$Op2,
- nxv16i8:$Op3)),
- (LUTI6_Z2ZZ (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0,
- nxv16i8:$Op2, zsub1),
- nxv16i8:$Op3)>;
+ defm LUTI6_Z2ZZ : sve2_luti6_vector<"luti6", int_aarch64_sve_luti6>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 99836aeed7c0a..e17b1b2e6c7a5 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -3921,15 +3921,19 @@ multiclass sme2_luti4_vector_vg4_index<string mnemonic> {
}
// 8-bit Look up table
-class sme2_lut_single<string asm>
- : I<(outs ZPR8:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn),
- asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> {
- bits<0> ZTt;
- bits<5> Zd;
- bits<5> Zn;
- let Inst{31-10} = 0b1100000011001000010000;
- let Inst{9-5} = Zn;
- let Inst{4-0} = Zd;
+multiclass sme2_lut_single<string asm, SDPatternOperator intrinsic> {
+ def NAME : I<(outs ZPR8:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn),
+ asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> {
+ bits<0> ZTt;
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-10} = 0b1100000011001000010000;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+ }
+
+ def : Pat<(nxv16i8 (intrinsic (imm_to_zt untyped:$zt), nxv16i8:$zn)),
+ (!cast<Instruction>(NAME) $zt, nxv16i8:$zn)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 7c09937020d2f..f10e9743181d0 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11398,18 +11398,24 @@ multiclass sve2_luti6_vector_index<string mnemonic> {
}
// Look up table
-class sve2_luti6_vector<string mnemonic>
- : I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, ZPRAny:$Zm),
- mnemonic, "\t$Zd, $Zn, $Zm",
- "", []>, Sched<[]> {
- bits<5> Zd;
- bits<5> Zn;
- bits<5> Zm;
- let Inst{31-21} = 0b01000101001;
- let Inst{20-16} = Zm;
- let Inst{15-10} = 0b101011;
- let Inst{9-5} = Zn;
- let Inst{4-0} = Zd;
+multiclass sve2_luti6_vector<string mnemonic, SDPatternOperator intrinsic> {
+ def NAME : I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, ZPRAny:$Zm),
+ mnemonic, "\t$Zd, $Zn, $Zm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-21} = 0b01000101001;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b101011;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+ }
+
+ def : Pat<(nxv16i8 (intrinsic nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)),
+ (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0,
+ nxv16i8:$Op2, zsub1),
+ nxv16i8:$Op3)>;
}
//===----------------------------------------------------------------------===//
>From c91605be19d38d882dfae2993de4fa90ba73061c Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 19 Mar 2026 19:39:05 +0000
Subject: [PATCH 03/14] fixup! Reuse SelectMultiVectorLuti()
---
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 42 +++----------------
1 file changed, 6 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index ce793f304791e..b0ab89b2c0d64 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -417,9 +417,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm);
- void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc);
- void SelectMultiVectorLutiZT(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
- unsigned NumInVecs);
+ void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
+ unsigned NumInVecs);
template <unsigned MaxIdx, unsigned Scale>
bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
@@ -2306,37 +2305,8 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
unsigned NumOutVecs,
- unsigned Opc) {
- SDValue ZtValue;
- if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
- return;
-
- SDValue Chain = Node->getOperand(0);
- SDValue Ops[] = {ZtValue,
- createZMulTuple({Node->getOperand(3), Node->getOperand(4)}),
- Chain};
-
- SDLoc DL(Node);
- EVT VT = Node->getValueType(0);
-
- SDNode *Instruction =
- CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
- SDValue SuperReg = SDValue(Instruction, 0);
-
- for (unsigned I = 0; I < NumOutVecs; ++I)
- ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
- AArch64::zsub0 + I, DL, VT, SuperReg));
-
- // Copy chain
- unsigned ChainIdx = NumOutVecs;
- ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1));
- CurDAG->RemoveDeadNode(Node);
-}
-
-void AArch64DAGToDAGISel::SelectMultiVectorLutiZT(SDNode *Node,
- unsigned NumOutVecs,
- unsigned Opc,
- unsigned NumInVecs) {
+ unsigned Opc,
+ unsigned NumInVecs) {
const unsigned ChainOp = 0;
const unsigned ZtOp = 2;
const unsigned FirstVecOp = 3;
@@ -6006,11 +5976,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
}
case Intrinsic::aarch64_sme_luti4_zt_x4: {
- SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z);
+ SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z, 2);
return;
}
case Intrinsic::aarch64_sme_luti6_zt_x4: {
- SelectMultiVectorLutiZT(Node, 4, AArch64::LUTI6_4ZT3Z, 3);
+ SelectMultiVectorLuti(Node, 4, AArch64::LUTI6_4ZT3Z, 3);
return;
}
case Intrinsic::aarch64_sve_fp8_cvtl1_x2:
>From e405fbfbd44b0aff68cfd27411c4a0a6682b360f Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 19 Mar 2026 19:49:15 +0000
Subject: [PATCH 04/14] fixup! Add shared helper
AArch64DAGToDAGISel::EmitMultiVectorLutiLane() for reuse
---
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 89 ++++++++-----------
1 file changed, 39 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index b0ab89b2c0d64..1c4ca95b13561 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -414,6 +414,9 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm);
+ void EmitMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
+ unsigned Opc, ArrayRef<SDValue> Ops,
+ bool HasChain);
void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm);
@@ -2227,6 +2230,35 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode);
}
+void AArch64DAGToDAGISel::EmitMultiVectorLutiLane(SDNode *Node,
+ unsigned NumOutVecs,
+ unsigned Opc,
+ ArrayRef<SDValue> Ops,
+ bool HasChain) {
+ SDLoc DL(Node);
+ EVT VT = Node->getValueType(0);
+
+ SmallVector<SDValue, 4> MachineOps(Ops);
+ SDNode *Instruction;
+ if (HasChain) {
+ MachineOps.push_back(Node->getOperand(0));
+ Instruction =
+ CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, MachineOps);
+ } else {
+ Instruction = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, MachineOps);
+ }
+ SDValue SuperReg(Instruction, 0);
+
+ for (unsigned i = 0; i < NumOutVecs; ++i)
+ ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg(
+ AArch64::zsub0 + i, DL, VT, SuperReg));
+
+ if (HasChain)
+ ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1));
+
+ CurDAG->RemoveDeadNode(Node);
+}
+
void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc,
@@ -2239,68 +2271,25 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
return;
- SDValue Chain = Node->getOperand(0);
- SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4), Chain};
- SDLoc DL(Node);
- EVT VT = Node->getValueType(0);
-
- SDNode *Instruction =
- CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
- SDValue SuperReg = SDValue(Instruction, 0);
-
- for (unsigned I = 0; I < NumOutVecs; ++I)
- ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
- AArch64::zsub0 + I, DL, VT, SuperReg));
-
- // Copy chain
- unsigned ChainIdx = NumOutVecs;
- ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1));
- CurDAG->RemoveDeadNode(Node);
+ SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
+ EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops, /*HasChain=*/true);
}
void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc,
uint32_t MaxImm) {
- const bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN;
- const unsigned BaseOp = HasChain ? 1 : 0;
- const unsigned t0 = BaseOp + 1;
- const unsigned t1 = BaseOp + 2;
- const unsigned i0 = BaseOp + 3;
- const unsigned i1 = BaseOp + 4;
- const unsigned ImmOp = BaseOp + 5;
-
- SDValue ImmVal = Node->getOperand(ImmOp);
+ SDValue ImmVal = Node->getOperand(5);
if (auto *Imm = dyn_cast<ConstantSDNode>(ImmVal))
if (Imm->getZExtValue() > MaxImm)
return;
- SDLoc DL(Node);
- EVT VT = Node->getValueType(0);
SmallVector<SDValue, 4> Ops = {
- createZTuple({Node->getOperand(t0), Node->getOperand(t1)}),
- createZTuple({Node->getOperand(i0), Node->getOperand(i1)}),
- Node->getOperand(ImmOp),
+ createZTuple({Node->getOperand(1), Node->getOperand(2)}),
+ createZTuple({Node->getOperand(3), Node->getOperand(4)}),
+ Node->getOperand(5),
};
-
- SDNode *Instruction;
- if (HasChain) {
- Ops.push_back(Node->getOperand(0));
- Instruction =
- CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
- } else {
- Instruction = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops);
- }
- SDValue SuperReg(Instruction, 0);
-
- for (unsigned i = 0; i < NumOutVecs; ++i)
- ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg(
- AArch64::zsub0 + i, DL, VT, SuperReg));
-
- if (HasChain)
- ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1));
-
- CurDAG->RemoveDeadNode(Node);
+ EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops, /*HasChain=*/false);
}
void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
>From af411fa1dd8bd782475f2e0b0ee505b44fe30a59 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 23 Mar 2026 15:51:29 +0000
Subject: [PATCH 05/14] fixup! Address PR comments
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 11 +----------
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 4 ++--
2 files changed, 3 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 346fe850576e8..766007b3996de 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -227,7 +227,7 @@ def HasSVE_B16MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasS
AssemblerPredicateWithAll<(all_of FeatureSVE_B16MM), "sve-b16mm">;
def HasF16MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasF16MM()">,
AssemblerPredicateWithAll<(all_of FeatureF16MM), "f16mm">;
-def HasSVE2p3 : Predicate<"Subtarget->hasSVE2p3()">,
+def HasSVE2p3 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">;
def HasSME2p3 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p3()">,
AssemblerPredicateWithAll<(all_of FeatureSME2p3), "sme2p3">;
@@ -313,15 +313,6 @@ def HasNonStreamingSVE2p2_or_SME2p2
"(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">,
AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2),
"sme2p2 or sve2p2">;
-def HasNonStreamingSVE2p3
- : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()">,
- AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">;
-def HasNonStreamingSVE2p3_or_SME2p3
- : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()) ||"
- "(Subtarget->isStreaming() && Subtarget->hasSME2p3())">,
- AssemblerPredicateWithAll<(any_of FeatureSVE2p3, FeatureSME2p3),
- "sme2p3 or sve2p3">;
-
def HasSMEF16F16_or_SMEF8F16
: Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">,
AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 00734da5422e1..f547038a1353b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4843,14 +4843,14 @@ let Predicates = [HasSVE2p3_or_SME2p3] in {
} // End HasSME2p3orSVE2p3
-let Predicates = [HasNonStreamingSVE2p3_or_SME2p3] in {
+let Predicates = [HasSVE2p3_or_SME2p3] in {
defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">;
}
//===----------------------------------------------------------------------===//
// SVE2.3 instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasNonStreamingSVE2p3] in {
+let Predicates = [HasSVE2p3] in {
defm LUTI6_Z2ZZ : sve2_luti6_vector<"luti6", int_aarch64_sve_luti6>;
}
>From 3fb6456438af0d969aac3293f351ac80668d3b78 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 23 Mar 2026 21:13:29 +0000
Subject: [PATCH 06/14] fixup! Address more PR comments
---
clang/include/clang/Basic/arm_sme.td | 1 -
clang/include/clang/Basic/arm_sve.td | 4 ++--
clang/lib/Sema/SemaARM.cpp | 24 ++++++++++++++++---
.../sme2p3-intrinsics/acle_sme2p3_luti6.c | 16 +++++++++----
.../sve2p3-intrinsics/acle_sve2p3_luti6.c | 12 +++++-----
.../acle_sme2p3_target_lane.c | 2 +-
.../acle_sve2p3_target_lane.c | 9 +++++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 16 ++++---------
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 5 ++--
llvm/lib/Target/AArch64/SVEInstrFormats.td | 4 ++--
10 files changed, 59 insertions(+), 34 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 8de360fca5f5e..678fa1efc2a51 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -984,7 +984,6 @@ let SMETargetGuard = "sme-lutv2" in {
let SMETargetGuard = "sme2p3" in {
def SVLUTI6_ZT : SInst<"svluti6_zt_{d}", "diu", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
def SVLUTI6_ZT_X4 : SInst<"svluti6_zt_{d}_x4", "4i3.u", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
- def SVLUTI6_LANE_X4 : SInst<"svluti6_lane[_{d}_x4]", "42.d2.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
}
let SMETargetGuard = "sme-f8f32" in {
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 555d334638763..a835833681c24 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1881,8 +1881,8 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
}
let SVETargetGuard = "sve2p3", SMETargetGuard = "sme2p3" in {
- def SVLUTI6_x2_I16 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUs", MergeNone, "aarch64_sve_luti6_lane_x2_i16", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
- def SVLUTI6_x2_F16 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "h", MergeNone, "aarch64_sve_luti6_lane_x2_f16", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+ def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+ def SVLUTI6_x4 : SInst<"svluti6_lane[_{d}_x4]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index e54c8228e5ff8..16d73f65fea67 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "clang/Sema/SemaARM.h"
+#include "clang/Basic/DiagnosticFrontend.h"
#include "clang/Basic/DiagnosticSema.h"
#include "clang/Basic/TargetBuiltins.h"
#include "clang/Basic/TargetInfo.h"
@@ -610,9 +611,26 @@ static bool checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
BuiltinType = SemaARM::ArmNonStreaming;
else if (SatisfiesSME)
BuiltinType = SemaARM::ArmStreaming;
- else
- // This should be diagnosed by CodeGen
- return false;
+ else {
+ switch (BuiltinID) {
+ case SVE::BI__builtin_sve_svluti6_lane_bf16_x4:
+ case SVE::BI__builtin_sve_svluti6_lane_f16_x4:
+ case SVE::BI__builtin_sve_svluti6_lane_s16_x4:
+ case SVE::BI__builtin_sve_svluti6_lane_u16_x4: {
+ std::string BuiltinName =
+ std::string(S.Context.BuiltinInfo.getQuotedName(BuiltinID));
+ const FunctionDecl *Callee = TheCall->getDirectCallee();
+ if (Callee)
+ BuiltinName = "'" + Callee->getName().str() + "'";
+ S.Diag(TheCall->getBeginLoc(), diag::err_builtin_needs_feature)
+ << BuiltinName << RequiredFeatures;
+ return true;
+ }
+ default:
+ // This should be diagnosed by CodeGen.
+ return false;
+ }
+ }
}
if (FnType != SemaARM::ArmNonStreaming &&
diff --git a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
index ae5fb1f64d0fc..02dac71bb8de7 100644
--- a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
@@ -3,10 +3,18 @@
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
#include <arm_sme.h>
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3_UNUSED,A4_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_s16_x4(
// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
@@ -21,7 +29,7 @@
//
svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return svluti6_lane_s16_x4(table, indices, 1);
+ return SVE_ACLE_FUNC(svluti6_lane,_s16,_x4,)(table, indices, 1);
}
// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4(
@@ -38,7 +46,7 @@ svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
//
svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return svluti6_lane_u16_x4(table, indices, 0);
+ return SVE_ACLE_FUNC(svluti6_lane,_u16,_x4,)(table, indices, 0);
}
// CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4(
@@ -55,7 +63,7 @@ svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
//
svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return svluti6_lane_f16_x4(table, indices, 1);
+ return SVE_ACLE_FUNC(svluti6_lane,_f16,_x4,)(table, indices, 1);
}
// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4(
@@ -72,7 +80,7 @@ svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
//
svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return svluti6_lane_bf16_x4(table, indices, 0);
+ return SVE_ACLE_FUNC(svluti6_lane,_bf16,_x4,)(table, indices, 0);
}
// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_s8(
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
index a806ef0b13c20..b70a83b91a5af 100644
--- a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
@@ -66,13 +66,13 @@ svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) {
// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
//
svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) {
@@ -82,13 +82,13 @@ svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) {
// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
//
svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) {
@@ -98,13 +98,13 @@ svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) {
// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) {
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
index 1a06663a9aab7..74bdbf8723fed 100644
--- a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
@@ -6,7 +6,7 @@
svbfloat16x4_t missing_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature sme,sme2p3}}
+ return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
}
__attribute__((target("sme2p3,bf16")))
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
index 6a2465f4027fc..2aec0c5daa039 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -12,3 +12,12 @@ __attribute__((target("sve2p3")))
svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
return svluti6_lane_f16_x2(table, indices, 0);
}
+
+svfloat16x4_t missing_sve2p3_luti6_lane_x4(svfloat16x2_t table, svuint8x2_t indices) {
+ return svluti6_lane_f16_x4(table, indices, 1); // expected-error {{'svluti6_lane_f16_x4' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
+}
+
+__attribute__((target("sve2p3")))
+svfloat16x4_t has_sve2p3_luti6_lane_x4(svfloat16x2_t table, svuint8x2_t indices) {
+ return svluti6_lane_f16_x4(table, indices, 0);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 9228555db06d2..01e21bc0622b4 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2812,17 +2812,10 @@ def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
llvm_nxv16i8_ty,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
-def int_aarch64_sve_luti6_lane_x2_i16
- : DefaultAttrsIntrinsic<[llvm_nxv8i16_ty],
- [llvm_nxv8i16_ty,
- llvm_nxv8i16_ty,
- llvm_nxv16i8_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
-def int_aarch64_sve_luti6_lane_x2_f16
- : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty],
- [llvm_nxv8f16_ty,
- llvm_nxv8f16_ty,
+def int_aarch64_sve_luti6_lane_x2
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMMatchType<0>,
llvm_nxv16i8_ty,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
@@ -4328,4 +4321,3 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sve_pmlal_pair_x2 : DefaultAttrsIntrinsic<[llvm_nxv2i64_ty, llvm_nxv2i64_ty],
[llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty], [IntrNoMem]>;
}
-
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 1c4ca95b13561..142e177381be2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -414,9 +414,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm);
- void EmitMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
- unsigned Opc, ArrayRef<SDValue> Ops,
- bool HasChain);
+ void EmitMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
+ ArrayRef<SDValue> Ops, bool HasChain);
void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm);
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index f10e9743181d0..05b386664a191 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11385,12 +11385,12 @@ multiclass sve2_luti6_vector_index<string mnemonic> {
let Inst{23} = idx;
}
- def : Pat<(nxv8i16 (int_aarch64_sve_luti6_lane_x2_i16 nxv8i16:$Op1, nxv8i16:$Op2,
+ def : Pat<(nxv8i16 (int_aarch64_sve_luti6_lane_x2 nxv8i16:$Op1, nxv8i16:$Op2,
nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
(nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
nxv8i16:$Op2, zsub1),
nxv16i8:$Op3, timm32_0_1:$Op4))>;
- def : Pat<(nxv8f16 (int_aarch64_sve_luti6_lane_x2_f16 nxv8f16:$Op1, nxv8f16:$Op2,
+ def : Pat<(nxv8f16 (int_aarch64_sve_luti6_lane_x2 nxv8f16:$Op1, nxv8f16:$Op2,
nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
(nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
nxv8f16:$Op2, zsub1),
>From 10e4c175df5910cc42b01a910e05125367ea7b3b Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 23 Mar 2026 22:55:41 +0000
Subject: [PATCH 07/14] fixup! Fix final PR comments for now
---
clang/include/clang/Basic/arm_sve.td | 2 +-
.../sve2p3-intrinsics/acle_sve2p3_luti6.c | 126 ++++++++++++++++--
llvm/lib/Target/AArch64/SVEInstrFormats.td | 5 +
.../test/Verifier/AArch64/luti6-intrinsics.ll | 1 +
4 files changed, 125 insertions(+), 9 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index a835833681c24..b47f6ab0aae57 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1881,7 +1881,7 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
}
let SVETargetGuard = "sve2p3", SMETargetGuard = "sme2p3" in {
- def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+ def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
def SVLUTI6_x4 : SInst<"svluti6_lane[_{d}_x4]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
}
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
index b70a83b91a5af..59d24b641f9d5 100644
--- a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
@@ -1,11 +1,15 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
#include <arm_sve.h>
@@ -15,18 +19,36 @@
#define SVE_ACLE_FUNC(A1, A2) A1##A2
#endif
+#ifdef STREAMING_MODE
+#define STREAMING_ATTR __arm_streaming
+#else
+#define STREAMING_ATTR
+#endif
+
// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8(
// CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8(
+// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) {
return SVE_ACLE_FUNC(svluti6, _s8)(table, indices);
}
@@ -37,12 +59,24 @@ svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) {
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_u8(
+// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) {
return SVE_ACLE_FUNC(svluti6, _u8)(table, indices);
}
@@ -53,12 +87,24 @@ svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) {
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_mf8(
+// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
+// STREAM-CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) {
return SVE_ACLE_FUNC(svluti6, _mf8)(table, indices);
}
@@ -69,13 +115,25 @@ svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) {
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
//
-svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) {
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) STREAMING_ATTR {
return SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(table, indices, 1);
}
@@ -85,13 +143,25 @@ svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) {
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
//
-svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) {
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) STREAMING_ATTR {
return SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(table, indices, 0);
}
@@ -101,12 +171,52 @@ svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) {
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
-svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) {
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
return SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(table, indices, 1);
}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svluti6_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
+ return SVE_ACLE_FUNC(svluti6_lane, _bf16_x2)(table, indices, 0);
+}
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 05b386664a191..73ba293c5c9b4 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11395,6 +11395,11 @@ multiclass sve2_luti6_vector_index<string mnemonic> {
(nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
nxv8f16:$Op2, zsub1),
nxv16i8:$Op3, timm32_0_1:$Op4))>;
+ def : Pat<(nxv8bf16 (int_aarch64_sve_luti6_lane_x2 nxv8bf16:$Op1, nxv8bf16:$Op2,
+ nxv16i8:$Op3, (i32 timm32_0_1:$Op4))),
+ (nxv8bf16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0,
+ nxv8bf16:$Op2, zsub1),
+ nxv16i8:$Op3, timm32_0_1:$Op4))>;
}
// Look up table
diff --git a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
index 0777c1db532b1..7818dd19ffb1a 100644
--- a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
+++ b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: not opt -S -passes=verify < %s 2>&1 | FileCheck %s
define <vscale x 8 x i16> @bad_sve_luti6_ret(<vscale x 16 x i8> %a,
>From ffe55ff17fab2e5e59861e469c3d088b05c4e69e Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 23 Mar 2026 23:30:23 +0000
Subject: [PATCH 08/14] fixup! Address more PR comments
---
clang/include/clang/Basic/arm_sve.td | 5 +++-
clang/lib/Sema/SemaARM.cpp | 23 +++----------------
.../acle_sme2p3_target_lane.c | 2 +-
.../acle_sve2p3_target_lane.c | 9 --------
4 files changed, 8 insertions(+), 31 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index b47f6ab0aae57..b5947b071fe37 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1882,7 +1882,10 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
let SVETargetGuard = "sve2p3", SMETargetGuard = "sme2p3" in {
def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
- def SVLUTI6_x4 : SInst<"svluti6_lane[_{d}_x4]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
+}
+
+let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2p3" in {
+ def SVLUTI6_x4 : SInst<"svluti6_lane[_{d}_x4]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 16d73f65fea67..0d5376e786392 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -611,26 +611,9 @@ static bool checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
BuiltinType = SemaARM::ArmNonStreaming;
else if (SatisfiesSME)
BuiltinType = SemaARM::ArmStreaming;
- else {
- switch (BuiltinID) {
- case SVE::BI__builtin_sve_svluti6_lane_bf16_x4:
- case SVE::BI__builtin_sve_svluti6_lane_f16_x4:
- case SVE::BI__builtin_sve_svluti6_lane_s16_x4:
- case SVE::BI__builtin_sve_svluti6_lane_u16_x4: {
- std::string BuiltinName =
- std::string(S.Context.BuiltinInfo.getQuotedName(BuiltinID));
- const FunctionDecl *Callee = TheCall->getDirectCallee();
- if (Callee)
- BuiltinName = "'" + Callee->getName().str() + "'";
- S.Diag(TheCall->getBeginLoc(), diag::err_builtin_needs_feature)
- << BuiltinName << RequiredFeatures;
- return true;
- }
- default:
- // This should be diagnosed by CodeGen.
- return false;
- }
- }
+ else
+ // This should be diagnosed by CodeGen.
+ return false;
}
if (FnType != SemaARM::ArmNonStreaming &&
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
index 74bdbf8723fed..1a06663a9aab7 100644
--- a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
@@ -6,7 +6,7 @@
svbfloat16x4_t missing_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
+ return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature sme,sme2p3}}
}
__attribute__((target("sme2p3,bf16")))
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
index 2aec0c5daa039..6a2465f4027fc 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -12,12 +12,3 @@ __attribute__((target("sve2p3")))
svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
return svluti6_lane_f16_x2(table, indices, 0);
}
-
-svfloat16x4_t missing_sve2p3_luti6_lane_x4(svfloat16x2_t table, svuint8x2_t indices) {
- return svluti6_lane_f16_x4(table, indices, 1); // expected-error {{'svluti6_lane_f16_x4' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
-}
-
-__attribute__((target("sve2p3")))
-svfloat16x4_t has_sve2p3_luti6_lane_x4(svfloat16x2_t table, svuint8x2_t indices) {
- return svluti6_lane_f16_x4(table, indices, 0);
-}
>From d5f3f73b2685a8eb51fdaebcb55a302f2b7b40af Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Wed, 8 Apr 2026 14:21:23 +0100
Subject: [PATCH 09/14] fixup! Fix PR comments
---
llvm/include/llvm/IR/IntrinsicsAArch64.td | 1 -
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 53 ++++-------
.../AArch64/sme2p3-intrinsics-luti6.ll | 94 +++++--------------
.../AArch64/sve2p3-intrinsics-luti6.ll | 48 +++-------
.../test/Verifier/AArch64/luti6-intrinsics.ll | 74 ++++-----------
5 files changed, 75 insertions(+), 195 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 01e21bc0622b4..51ea461d116ad 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1054,7 +1054,6 @@ def llvm_nxv4i1_ty : LLVMType<nxv4i1>;
def llvm_nxv8i1_ty : LLVMType<nxv8i1>;
def llvm_nxv16i1_ty : LLVMType<nxv16i1>;
def llvm_nxv16i8_ty : LLVMType<nxv16i8>;
-def llvm_nxv8i16_ty : LLVMType<nxv8i16>;
def llvm_nxv4i32_ty : LLVMType<nxv4i32>;
def llvm_nxv2i64_ty : LLVMType<nxv2i64>;
def llvm_nxv8f16_ty : LLVMType<nxv8f16>;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 142e177381be2..97888d489ac67 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -415,7 +415,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm);
void EmitMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
- ArrayRef<SDValue> Ops, bool HasChain);
+ ArrayRef<SDValue> Ops);
void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm);
@@ -2232,20 +2232,18 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
void AArch64DAGToDAGISel::EmitMultiVectorLutiLane(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc,
- ArrayRef<SDValue> Ops,
- bool HasChain) {
+ ArrayRef<SDValue> Ops) {
SDLoc DL(Node);
EVT VT = Node->getValueType(0);
+ bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN;
SmallVector<SDValue, 4> MachineOps(Ops);
- SDNode *Instruction;
- if (HasChain) {
+ SmallVector<EVT, 2> ResTys = {MVT::Untyped};
+ if (HasChain)
MachineOps.push_back(Node->getOperand(0));
- Instruction =
- CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, MachineOps);
- } else {
- Instruction = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, MachineOps);
- }
+ if (HasChain)
+ ResTys.push_back(MVT::Other);
+ SDNode *Instruction = CurDAG->getMachineNode(Opc, DL, ResTys, MachineOps);
SDValue SuperReg(Instruction, 0);
for (unsigned i = 0; i < NumOutVecs; ++i)
@@ -2271,7 +2269,7 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
return;
SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
- EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops, /*HasChain=*/true);
+ EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops);
}
void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
@@ -2288,37 +2286,20 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
createZTuple({Node->getOperand(3), Node->getOperand(4)}),
Node->getOperand(5),
};
- EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops, /*HasChain=*/false);
+ EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops);
}
void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc,
unsigned NumInVecs) {
- const unsigned ChainOp = 0;
- const unsigned ZtOp = 2;
- const unsigned FirstVecOp = 3;
-
SDValue ZtValue;
- if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(ZtOp), ZtValue))
+ if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
return;
- SDValue ZTuple;
- switch (NumInVecs) {
- case 2:
- ZTuple = createZMulTuple(
- {Node->getOperand(FirstVecOp), Node->getOperand(FirstVecOp + 1)});
- break;
- case 3:
- ZTuple = createZTuple({Node->getOperand(FirstVecOp),
- Node->getOperand(FirstVecOp + 1),
- Node->getOperand(FirstVecOp + 2)});
- break;
- default:
- llvm_unreachable("unexpected LUTI ZT tuple width");
- }
-
- SDValue Ops[] = {ZtValue, ZTuple, Node->getOperand(ChainOp)};
+ SmallVector<SDValue, 4> Regs(Node->ops().slice(3, NumInVecs));
+ SDValue ZTuple = NumInVecs == 2 ? createZMulTuple(Regs) : createZTuple(Regs);
+ SDValue Ops[] = {ZtValue, ZTuple, Node->getOperand(0)};
SDLoc DL(Node);
EVT VT = Node->getValueType(0);
@@ -2327,9 +2308,9 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
SDValue SuperReg(Instruction, 0);
- for (unsigned i = 0; i < NumOutVecs; ++i)
- ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg(
- AArch64::zsub0 + i, DL, VT, SuperReg));
+ for (unsigned I = 0; I < NumOutVecs; ++I)
+ ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
+ AArch64::zsub0 + I, DL, VT, SuperReg));
ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1));
CurDAG->RemoveDeadNode(Node);
diff --git a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
index 07fb62baa58cd..95055414ef562 100644
--- a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
@@ -1,8 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -verify-machineinstrs -force-streaming -mtriple=aarch64-none-linux-gnu -mattr=+sme2p3 < %s | FileCheck %s
-target triple = "aarch64-none-linux-gnu"
-
define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
; CHECK-LABEL: luti6_zt_i8:
; CHECK: // %bb.0:
@@ -13,93 +11,49 @@ define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
ret <vscale x 16 x i8> %res
}
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
- <vscale x 16 x i8> } @luti6_zt_i8_x4(<vscale x 16 x i8> %a,
- <vscale x 16 x i8> %b,
- <vscale x 16 x i8> %c) #0 {
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @luti6_zt_i8_x4(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) #0 {
; CHECK-LABEL: luti6_zt_i8_x4:
; CHECK: // %bb.0:
; CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z0 - z2 }
; CHECK-NEXT: ret
- %res = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>,
- <vscale x 16 x i8>, <vscale x 16 x i8> }
- @llvm.aarch64.sme.luti6.zt.x4(
- i32 0, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b,
- <vscale x 16 x i8> %c)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
- <vscale x 16 x i8> } %res
+ %res = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c)
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16> } @luti6_i16_x4(<vscale x 8 x i16> %a,
- <vscale x 8 x i16> %b,
- <vscale x 16 x i8> %x,
- <vscale x 16 x i8> %y) #0 {
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @luti6_i16_x4(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) #0 {
; CHECK-LABEL: luti6_i16_x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1]
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: luti6 { z0.h - z3.h }, { z3.h, z4.h }, { z1, z2 }[1]
; CHECK-NEXT: ret
- %res = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16>, <vscale x 8 x i16> }
- @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
- <vscale x 8 x i16> %a, <vscale x 8 x i16> %b,
- <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16> } %res
+ %res = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
- <vscale x 8 x bfloat> } @luti6_bf16_x4(<vscale x 8 x bfloat> %a,
- <vscale x 8 x bfloat> %b,
- <vscale x 16 x i8> %x,
- <vscale x 16 x i8> %y) #0 {
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @luti6_bf16_x4(<vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b) #0 {
; CHECK-LABEL: luti6_bf16_x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[0]
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: luti6 { z0.h - z3.h }, { z3.h, z4.h }, { z1, z2 }[0]
; CHECK-NEXT: ret
- %res = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
- <vscale x 8 x bfloat>, <vscale x 8 x bfloat> }
- @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(
- <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b,
- <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 0)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
- <vscale x 8 x bfloat> } %res
+ %res = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 0)
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
- <vscale x 8 x half> } @luti6_f16_x4(<vscale x 8 x half> %a,
- <vscale x 8 x half> %b,
- <vscale x 16 x i8> %x,
- <vscale x 16 x i8> %y) #0 {
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @luti6_f16_x4(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) #0 {
; CHECK-LABEL: luti6_f16_x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1]
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: luti6 { z0.h - z3.h }, { z3.h, z4.h }, { z1, z2 }[1]
; CHECK-NEXT: ret
- %res = tail call { <vscale x 8 x half>, <vscale x 8 x half>,
- <vscale x 8 x half>, <vscale x 8 x half> }
- @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(
- <vscale x 8 x half> %a, <vscale x 8 x half> %b,
- <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
- ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
- <vscale x 8 x half> } %res
+ %res = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
+ ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}
-declare <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(
- i32, <vscale x 16 x i8>)
-declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
- <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(
- i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
- <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>,
- <vscale x 16 x i8>, i32)
-declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
- <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(
- <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 16 x i8>,
- <vscale x 16 x i8>, i32)
-declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
- <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(
- <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>,
- <vscale x 16 x i8>, i32)
-
attributes #0 = { "target-features"="+sme2p3" }
diff --git a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
index ab89e87df66d2..73cec7a570061 100644
--- a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
@@ -1,55 +1,37 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve2p3 < %s | FileCheck %s
-target triple = "aarch64-none-linux-gnu"
-
-define <vscale x 16 x i8> @luti6_i8(<vscale x 16 x i8> %a,
+define <vscale x 16 x i8> @luti6_i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: luti6_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
-; CHECK-NEXT: luti6 z0.b, { z0.b, z1.b }, z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 def $z0_z1
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: luti6 z0.b, { z0.b, z1.b }, z0
; CHECK-NEXT: ret
- <vscale x 16 x i8> %b,
- <vscale x 16 x i8> %idx) {
- %res = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(
- <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %idx)
+ %res = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
ret <vscale x 16 x i8> %res
}
define <vscale x 8 x i16> @luti6_i16_x2(<vscale x 8 x i16> %a,
; CHECK-LABEL: luti6_i16_x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
-; CHECK-NEXT: luti6 z0.h, { z0.h, z1.h }, z2[1]
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: luti6 z0.h, { z2.h, z3.h }, z1[1]
; CHECK-NEXT: ret
- <vscale x 8 x i16> %b,
- <vscale x 16 x i8> %idx) {
- %res = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
- <vscale x 8 x i16> %a, <vscale x 8 x i16> %b,
- <vscale x 16 x i8> %idx, i32 1)
+ <vscale x 16 x i8> %b) {
+ %res = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, i32 1)
ret <vscale x 8 x i16> %res
}
define <vscale x 8 x half> @luti6_f16_x2(<vscale x 8 x half> %a,
; CHECK-LABEL: luti6_f16_x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
-; CHECK-NEXT: luti6 z0.h, { z0.h, z1.h }, z2[0]
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: luti6 z0.h, { z2.h, z3.h }, z1[0]
; CHECK-NEXT: ret
- <vscale x 8 x half> %b,
- <vscale x 16 x i8> %idx) {
- %res = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
- <vscale x 8 x half> %a, <vscale x 8 x half> %b,
- <vscale x 16 x i8> %idx, i32 0)
+ <vscale x 16 x i8> %b) {
+ %res = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, i32 0)
ret <vscale x 8 x half> %res
}
-
-declare <vscale x 16 x i8> @llvm.aarch64.sve.luti6(
- <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
- <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
-declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
- <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>, i32)
diff --git a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
index 7818dd19ffb1a..9c5869e84f783 100644
--- a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
+++ b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
@@ -1,80 +1,44 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: not opt -S -passes=verify < %s 2>&1 | FileCheck %s
-define <vscale x 8 x i16> @bad_sve_luti6_ret(<vscale x 16 x i8> %a,
- <vscale x 16 x i8> %b,
- <vscale x 16 x i8> %idx) {
+define <vscale x 8 x i16> @bad_sve_luti6_ret(<vscale x 16 x i8> %a) {
; CHECK: Intrinsic has incorrect return type!
- %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6(
- <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %idx)
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
ret <vscale x 8 x i16> %res
}
-define <vscale x 8 x i16> @bad_sve_luti6_lane_x2_arg(<vscale x 4 x i32> %a,
- <vscale x 8 x i16> %b,
- <vscale x 16 x i8> %idx) {
+define <vscale x 8 x i16> @bad_sve_luti6_lane_x2_arg(<vscale x 4 x i32> %a, <vscale x 16 x i8> %b) {
; CHECK: Intrinsic has incorrect argument type!
- %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
- <vscale x 4 x i32> %a, <vscale x 8 x i16> %b,
- <vscale x 16 x i8> %idx, i32 1)
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 16 x i8> %b, i32 1)
ret <vscale x 8 x i16> %res
}
-define <vscale x 8 x half> @bad_sve_luti6_lane_x2_f16_arg(
- <vscale x 8 x i16> %a, <vscale x 8 x half> %b, <vscale x 16 x i8> %idx) {
+define <vscale x 8 x half> @bad_sve_luti6_lane_x2_f16_arg(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
; CHECK: Intrinsic has incorrect argument type!
- %res = call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
- <vscale x 8 x i16> %a, <vscale x 8 x half> %b,
- <vscale x 16 x i8> %idx, i32 1)
+ %res = call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, i32 1)
ret <vscale x 8 x half> %res
}
-define <vscale x 8 x i16> @bad_sme_luti6_zt_ret(i32 %zt,
- <vscale x 16 x i8> %idx) {
+define <vscale x 8 x i16> @bad_sme_luti6_zt_ret(i32 %zt, <vscale x 16 x i8> %idx) {
; CHECK: Intrinsic has incorrect return type!
- %res = call <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(
- i32 %zt, <vscale x 16 x i8> %idx)
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(i32 %zt, <vscale x 16 x i8> %idx)
ret <vscale x 8 x i16> %res
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16> } @bad_sme_luti6_zt_x4_ret(i32 %zt,
- <vscale x 16 x i8> %a,
- <vscale x 16 x i8> %b,
- <vscale x 16 x i8> %c) {
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @bad_sme_luti6_zt_x4_ret(i32 %zt, <vscale x 16 x i8> %a) {
; CHECK: Intrinsic has incorrect return type!
- %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(
- i32 %zt, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b,
- <vscale x 16 x i8> %c)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16> } %res
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(i32 %zt, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16> } @bad_sme_luti6_lane_x4_arg(
- <vscale x 8 x half> %a, <vscale x 8 x i16> %b,
- <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @bad_sme_luti6_lane_x4_arg(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) {
; CHECK: Intrinsic has incorrect argument type!
- %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
- <vscale x 8 x half> %a, <vscale x 8 x i16> %b,
- <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16> } %res
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}
-declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6(
- <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(
- <vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
-declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(
- <vscale x 8 x i16>, <vscale x 8 x half>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
declare <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(i32, <vscale x 16 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(
- i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
- <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(
- <vscale x 8 x half>, <vscale x 8 x i16>, <vscale x 16 x i8>,
- <vscale x 16 x i8>, i32)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
>From 3e1dfdcb497862831e3fda1628861e3de362da17 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 9 Apr 2026 14:29:21 +0100
Subject: [PATCH 10/14] fixup! More small PR fixes
---
clang/include/clang/Basic/arm_sve.td | 2 +-
clang/lib/Sema/SemaARM.cpp | 3 +-
.../acle_sve2p3_target_lane.c | 9 +++-
llvm/include/llvm/IR/IntrinsicsAArch64.td | 1 +
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 15 +++----
.../AArch64/sme2p3-intrinsics-luti6.ll | 5 +--
.../AArch64/sve2p3-intrinsics-luti6.ll | 13 +++---
.../test/Verifier/AArch64/luti6-intrinsics.ll | 44 -------------------
8 files changed, 23 insertions(+), 69 deletions(-)
delete mode 100644 llvm/test/Verifier/AArch64/luti6-intrinsics.ll
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index b5947b071fe37..97df8f7173086 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1880,7 +1880,7 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
def SVLUTI6 : SInst<"svluti6[_{d}]", "d2u", "cUcm", MergeNone, "aarch64_sve_luti6", [IsOverloadNone]>;
}
-let SVETargetGuard = "sve2p3", SMETargetGuard = "sme2p3" in {
+let SVETargetGuard = "sve2p3", SMETargetGuard = "sve2p3|sme2p3" in {
def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
}
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 0d5376e786392..e54c8228e5ff8 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "clang/Sema/SemaARM.h"
-#include "clang/Basic/DiagnosticFrontend.h"
#include "clang/Basic/DiagnosticSema.h"
#include "clang/Basic/TargetBuiltins.h"
#include "clang/Basic/TargetInfo.h"
@@ -612,7 +611,7 @@ static bool checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
else if (SatisfiesSME)
BuiltinType = SemaARM::ArmStreaming;
else
- // This should be diagnosed by CodeGen.
+ // This should be diagnosed by CodeGen
return false;
}
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
index 6a2465f4027fc..6c70379ce3da5 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -5,10 +5,17 @@
#include <arm_sve.h>
svfloat16_t missing_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
- return svluti6_lane_f16_x2(table, indices, 1); // expected-error {{'svluti6_lane_f16_x2' needs target feature (sve,sve2p3)|(sme,sme2p3)}}
+ return svluti6_lane_f16_x2(table, indices, 1); // expected-error {{'svluti6_lane_f16_x2' needs target feature (sve,sve2p3)|(sme,(sve2p3|sme2p3))}}
}
__attribute__((target("sve2p3")))
svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
return svluti6_lane_f16_x2(table, indices, 0);
}
+
+__attribute__((target("sve2p3,sme")))
+svfloat16_t has_streaming_sve2p3_luti6_lane(svfloat16x2_t table,
+ svuint8_t indices)
+ __arm_streaming {
+ return svluti6_lane_f16_x2(table, indices, 1);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 51ea461d116ad..87bffc8613fc3 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -4320,3 +4320,4 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sve_pmlal_pair_x2 : DefaultAttrsIntrinsic<[llvm_nxv2i64_ty, llvm_nxv2i64_ty],
[llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty, llvm_nxv2i64_ty], [IntrNoMem]>;
}
+
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 97888d489ac67..88b86f5dec024 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2237,13 +2237,10 @@ void AArch64DAGToDAGISel::EmitMultiVectorLutiLane(SDNode *Node,
EVT VT = Node->getValueType(0);
bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN;
- SmallVector<SDValue, 4> MachineOps(Ops);
SmallVector<EVT, 2> ResTys = {MVT::Untyped};
- if (HasChain)
- MachineOps.push_back(Node->getOperand(0));
if (HasChain)
ResTys.push_back(MVT::Other);
- SDNode *Instruction = CurDAG->getMachineNode(Opc, DL, ResTys, MachineOps);
+ SDNode *Instruction = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
SDValue SuperReg(Instruction, 0);
for (unsigned i = 0; i < NumOutVecs; ++i)
@@ -2268,7 +2265,8 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
return;
- SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
+ SmallVector<SDValue, 4> Ops = {ZtValue, Node->getOperand(3),
+ Node->getOperand(4), Node->getOperand(0)};
EmitMultiVectorLutiLane(Node, NumOutVecs, Opc, Ops);
}
@@ -2276,10 +2274,9 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc,
uint32_t MaxImm) {
- SDValue ImmVal = Node->getOperand(5);
- if (auto *Imm = dyn_cast<ConstantSDNode>(ImmVal))
- if (Imm->getZExtValue() > MaxImm)
- return;
+ auto *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(5));
+ if (Imm && Imm->getZExtValue() > MaxImm)
+ return;
SmallVector<SDValue, 4> Ops = {
createZTuple({Node->getOperand(1), Node->getOperand(2)}),
diff --git a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
index 95055414ef562..7e6b9a280b254 100644
--- a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
@@ -6,8 +6,7 @@ define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: luti6 z0.b, zt0, z0
; CHECK-NEXT: ret
- %res = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(
- i32 0, <vscale x 16 x i8> %x)
+ %res = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> %x)
ret <vscale x 16 x i8> %res
}
@@ -55,5 +54,3 @@ define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale
%res = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}
-
-attributes #0 = { "target-features"="+sme2p3" }
diff --git a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
index 73cec7a570061..50b406a7e91b8 100644
--- a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
@@ -1,10 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve2p3 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve2p3 -enable-subreg-liveness < %s | FileCheck %s
define <vscale x 16 x i8> @luti6_i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: luti6_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z0 killed $z0 def $z0_z1
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: luti6 z0.b, { z0.b, z1.b }, z0
; CHECK-NEXT: ret
@@ -12,26 +11,24 @@ define <vscale x 16 x i8> @luti6_i8(<vscale x 16 x i8> %a) {
ret <vscale x 16 x i8> %res
}
-define <vscale x 8 x i16> @luti6_i16_x2(<vscale x 8 x i16> %a,
+define <vscale x 8 x i16> @luti6_i16_x2(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: luti6_i16_x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: luti6 z0.h, { z2.h, z3.h }, z1[1]
; CHECK-NEXT: ret
- <vscale x 16 x i8> %b) {
%res = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, i32 1)
ret <vscale x 8 x i16> %res
}
-define <vscale x 8 x half> @luti6_f16_x2(<vscale x 8 x half> %a,
+define <vscale x 8 x half> @luti6_f16_x2(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: luti6_f16_x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: luti6 z0.h, { z2.h, z3.h }, z1[0]
; CHECK-NEXT: ret
- <vscale x 16 x i8> %b) {
%res = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, i32 0)
ret <vscale x 8 x half> %res
}
diff --git a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
deleted file mode 100644
index 9c5869e84f783..0000000000000
--- a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-; RUN: not opt -S -passes=verify < %s 2>&1 | FileCheck %s
-
-define <vscale x 8 x i16> @bad_sve_luti6_ret(<vscale x 16 x i8> %a) {
-; CHECK: Intrinsic has incorrect return type!
- %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
- ret <vscale x 8 x i16> %res
-}
-
-define <vscale x 8 x i16> @bad_sve_luti6_lane_x2_arg(<vscale x 4 x i32> %a, <vscale x 16 x i8> %b) {
-; CHECK: Intrinsic has incorrect argument type!
- %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 16 x i8> %b, i32 1)
- ret <vscale x 8 x i16> %res
-}
-
-define <vscale x 8 x half> @bad_sve_luti6_lane_x2_f16_arg(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
-; CHECK: Intrinsic has incorrect argument type!
- %res = call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 16 x i8> %b, i32 1)
- ret <vscale x 8 x half> %res
-}
-
-define <vscale x 8 x i16> @bad_sme_luti6_zt_ret(i32 %zt, <vscale x 16 x i8> %idx) {
-; CHECK: Intrinsic has incorrect return type!
- %res = call <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(i32 %zt, <vscale x 16 x i8> %idx)
- ret <vscale x 8 x i16> %res
-}
-
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @bad_sme_luti6_zt_x4_ret(i32 %zt, <vscale x 16 x i8> %a) {
-; CHECK: Intrinsic has incorrect return type!
- %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(i32 %zt, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> %a)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
-}
-
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @bad_sme_luti6_lane_x4_arg(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) {
-; CHECK: Intrinsic has incorrect argument type!
- %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %b, i32 1)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
-}
-
-declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 16 x i8>, i32)
-declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32)
-declare <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(i32, <vscale x 16 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
>From b2fb0e96d11fcd5c03cb4ea70b50e57af9eb2b21 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 9 Apr 2026 15:42:07 +0100
Subject: [PATCH 11/14] fixup! Fix more PR comments
---
clang/include/clang/Basic/arm_sme.td | 4 ++--
clang/include/clang/Basic/arm_sve.td | 2 +-
.../sme2p3-intrinsics/acle_sme2p3_luti6.c | 8 +++----
llvm/include/llvm/IR/IntrinsicsAArch64.td | 23 ++++++++-----------
4 files changed, 17 insertions(+), 20 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 678fa1efc2a51..c79e6e2ae1f9a 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -982,8 +982,8 @@ let SMETargetGuard = "sme-lutv2" in {
}
let SMETargetGuard = "sme2p3" in {
- def SVLUTI6_ZT : SInst<"svluti6_zt_{d}", "diu", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
- def SVLUTI6_ZT_X4 : SInst<"svluti6_zt_{d}_x4", "4i3.u", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
+ def SVLUTI6_ZT : SInst<"svluti6_zt_{d}", "di[", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
+ def SVLUTI6_ZT_X4 : SInst<"svluti6_zt_{d}_x4", "4i3.[", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
}
let SMETargetGuard = "sme-f8f32" in {
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 97df8f7173086..4527c89ed9535 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1877,7 +1877,7 @@ let SVETargetGuard = "(sve2|sme2),lut", SMETargetGuard = "sme2,lut" in {
}
let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in {
- def SVLUTI6 : SInst<"svluti6[_{d}]", "d2u", "cUcm", MergeNone, "aarch64_sve_luti6", [IsOverloadNone]>;
+ def SVLUTI6 : SInst<"svluti6[_{d}]", "d2[", "cUcm", MergeNone, "aarch64_sve_luti6", [IsOverloadNone]>;
}
let SVETargetGuard = "sve2p3", SMETargetGuard = "sve2p3|sme2p3" in {
diff --git a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
index 02dac71bb8de7..d9f3809207e6b 100644
--- a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
@@ -29,7 +29,7 @@
//
svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return SVE_ACLE_FUNC(svluti6_lane,_s16,_x4,)(table, indices, 1);
+ return SVE_ACLE_FUNC(svluti6_lane,_s16_x4,,)(table, indices, 1);
}
// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4(
@@ -46,7 +46,7 @@ svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
//
svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return SVE_ACLE_FUNC(svluti6_lane,_u16,_x4,)(table, indices, 0);
+ return SVE_ACLE_FUNC(svluti6_lane,_u16_x4,,)(table, indices, 0);
}
// CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4(
@@ -63,7 +63,7 @@ svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
//
svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return SVE_ACLE_FUNC(svluti6_lane,_f16,_x4,)(table, indices, 1);
+ return SVE_ACLE_FUNC(svluti6_lane,_f16_x4,,)(table, indices, 1);
}
// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4(
@@ -80,7 +80,7 @@ svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
//
svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return SVE_ACLE_FUNC(svluti6_lane,_bf16,_x4,)(table, indices, 0);
+ return SVE_ACLE_FUNC(svluti6_lane,_bf16_x4,,)(table, indices, 0);
}
// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_s8(
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 87bffc8613fc3..92280744c9b3b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1382,6 +1382,14 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
llvm_i32_ty],
!listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<2>>])>;
+ class SVE2_LUTI_X2_Intrinsic<list<IntrinsicProperty> Attrs = []>
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ llvm_nxv16i8_ty,
+ llvm_i32_ty],
+ !listconcat(Attrs, [IntrNoMem, ImmArg<ArgIndex<3>>])>;
+
class SVE2_1VectorArg_Long_Intrinsic<list<IntrinsicProperty> Attrs = []>
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMSubdivide2VectorType<0>,
@@ -2805,19 +2813,8 @@ def int_aarch64_sve_luti6 : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
llvm_nxv16i8_ty,
llvm_nxv16i8_ty],
[IntrNoMem, IntrSpeculatable]>;
-def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>,
- LLVMMatchType<0>,
- llvm_nxv16i8_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
-def int_aarch64_sve_luti6_lane_x2
- : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>,
- LLVMMatchType<0>,
- llvm_nxv16i8_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>;
+def int_aarch64_sve_luti4_lane_x2 : SVE2_LUTI_X2_Intrinsic<[IntrSpeculatable]>;
+def int_aarch64_sve_luti6_lane_x2 : SVE2_LUTI_X2_Intrinsic<[IntrSpeculatable]>;
//
// SVE2 - Optional bit permutation
>From 9c6c74dbaf47dbf18020832d4639f82c5cab0750 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Thu, 9 Apr 2026 16:24:17 +0100
Subject: [PATCH 12/14] fixup! Adjust `def`s and split out tests
---
clang/include/clang/Basic/arm_sme.td | 1 +
clang/include/clang/Basic/arm_sve.td | 4 -
.../sme2p3-intrinsics/acle_sme2p3_luti6.c | 10 +-
.../sve2p3-intrinsics/acle_sve2p3_luti6.c | 158 ------------------
.../acle_sve2p3_luti6_lane_x2.c | 138 +++++++++++++++
5 files changed, 144 insertions(+), 167 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6_lane_x2.c
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index c79e6e2ae1f9a..2c77510a71f6e 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -982,6 +982,7 @@ let SMETargetGuard = "sme-lutv2" in {
}
let SMETargetGuard = "sme2p3" in {
+ def SVLUTI6_X4 : SInst<"svluti6_lane_{d}_x4[_{1}_x2]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
def SVLUTI6_ZT : SInst<"svluti6_zt_{d}", "di[", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
def SVLUTI6_ZT_X4 : SInst<"svluti6_zt_{d}_x4", "4i3.[", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
}
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 4527c89ed9535..ea3c260319af7 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1884,10 +1884,6 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = "sve2p3|sme2p3" in {
def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
}
-let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2p3" in {
- def SVLUTI6_x4 : SInst<"svluti6_lane[_{d}_x4]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
-}
-
////////////////////////////////////////////////////////////////////////////////
// SVE2 - Optional
diff --git a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
index d9f3809207e6b..b6ef5226a9a82 100644
--- a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c
@@ -10,7 +10,7 @@
#include <arm_sme.h>
#ifdef SVE_OVERLOADED_FORMS
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3_UNUSED,A4_UNUSED) A1
+#define SVE_ACLE_FUNC(A1,A2,A3_UNUSED,A4_UNUSED) A1##A2
#else
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
#endif
@@ -29,7 +29,7 @@
//
svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return SVE_ACLE_FUNC(svluti6_lane,_s16_x4,,)(table, indices, 1);
+ return SVE_ACLE_FUNC(svluti6_lane,_s16_x4,_s16_x2,)(table, indices, 1);
}
// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4(
@@ -46,7 +46,7 @@ svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices)
//
svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return SVE_ACLE_FUNC(svluti6_lane,_u16_x4,,)(table, indices, 0);
+ return SVE_ACLE_FUNC(svluti6_lane,_u16_x4,_u16_x2,)(table, indices, 0);
}
// CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4(
@@ -63,7 +63,7 @@ svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices)
//
svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return SVE_ACLE_FUNC(svluti6_lane,_f16_x4,,)(table, indices, 1);
+ return SVE_ACLE_FUNC(svluti6_lane,_f16_x4,_f16_x2,)(table, indices, 1);
}
// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4(
@@ -80,7 +80,7 @@ svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices)
//
svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return SVE_ACLE_FUNC(svluti6_lane,_bf16_x4,,)(table, indices, 0);
+ return SVE_ACLE_FUNC(svluti6_lane,_bf16_x4,_bf16_x2,)(table, indices, 0);
}
// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_s8(
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
index 59d24b641f9d5..5e3a65566ffd7 100644
--- a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c
@@ -5,10 +5,6 @@
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
-// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
-// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
-// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
#include <arm_sve.h>
@@ -19,36 +15,18 @@
#define SVE_ACLE_FUNC(A1, A2) A1##A2
#endif
-#ifdef STREAMING_MODE
-#define STREAMING_ATTR __arm_streaming
-#else
-#define STREAMING_ATTR
-#endif
-
// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8(
// CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
-// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8(
-// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
-//
// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
-//
svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) {
return SVE_ACLE_FUNC(svluti6, _s8)(table, indices);
}
@@ -59,24 +37,12 @@ svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) {
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
-// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_u8(
-// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
-//
// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
-//
svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) {
return SVE_ACLE_FUNC(svluti6, _u8)(table, indices);
}
@@ -87,136 +53,12 @@ svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) {
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
-// STREAM-CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_mf8(
-// STREAM-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
-//
// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
//
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]])
-// STREAM-CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
-//
svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) {
return SVE_ACLE_FUNC(svluti6, _mf8)(table, indices);
}
-
-// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
-// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
-// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
-// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// STREAM-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
-// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
-// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) STREAMING_ATTR {
- return SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(table, indices, 1);
-}
-
-// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
-// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
-// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// STREAM-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
-// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) STREAMING_ATTR {
- return SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(table, indices, 0);
-}
-
-// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
-// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
-//
-// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
-// STREAM-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// STREAM-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
-//
-// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
-// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
-//
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
-// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
-//
-svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
- return SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(table, indices, 1);
-}
-
-// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
-// CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
-//
-// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
-// STREAM-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// STREAM-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
-//
-// CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
-// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
-// CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
-//
-// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
-// STREAM-CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] {
-// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
-// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
-//
-svbfloat16_t test_svluti6_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
- return SVE_ACLE_FUNC(svluti6_lane, _bf16_x2)(table, indices, 0);
-}
diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6_lane_x2.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6_lane_x2.c
new file mode 100644
index 0000000000000..b6d8fe5cff531
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6_lane_x2.c
@@ -0,0 +1,138 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s --check-prefix=STREAM-CHECK
+// RUN: %clang_cc1 -DSTREAMING_MODE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +sme -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=STREAM-CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1, A2) A1##A2
+#endif
+
+#ifdef STREAMING_MODE
+#define STREAMING_ATTR __arm_streaming
+#else
+#define STREAMING_ATTR
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) STREAMING_ATTR {
+ return SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
+// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) STREAMING_ATTR {
+ return SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(table, indices, 0);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
+// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1)
+// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
+ return SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(table, indices, 1);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
+// CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// STREAM-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svluti6_lane_bf16_x2(
+// STREAM-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
+// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// STREAM-CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z25test_svluti6_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t(
+// STREAM-CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// STREAM-CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// STREAM-CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0)
+// STREAM-CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svluti6_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) STREAMING_ATTR {
+ return SVE_ACLE_FUNC(svluti6_lane, _bf16_x2)(table, indices, 0);
+}
>From 9ea750837f16114a452a2835a371ba22cefc98f5 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 13 Apr 2026 13:56:43 +0100
Subject: [PATCH 13/14] fixup! Add some more _bf16 tests
---
.../acle_sve2p3_imm.cpp | 3 +++
.../acle_sve2p3_target_lane.c | 27 +++++++++++++++++++
.../AArch64/sve2p3-intrinsics-luti6.ll | 11 ++++++++
3 files changed, 41 insertions(+)
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
index 8bbb0211b0bbb..6fca905f423ac 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp
@@ -21,4 +21,7 @@ void test_range_0_1() {
// expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(svcreate2_f16(svundef_f16(), svundef_f16()),
svundef_u8(), -1);
+ // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}}
+ SVE_ACLE_FUNC(svluti6_lane, _bf16_x2)(svcreate2_bf16(svundef_bf16(), svundef_bf16()),
+ svundef_u8(), 2);
}
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
index 6c70379ce3da5..846e72d154c60 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -13,9 +13,36 @@ svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
return svluti6_lane_f16_x2(table, indices, 0);
}
+__attribute__((target("sve2p3,bf16")))
+svbfloat16_t has_sve2p3_luti6_lane_bf16(svbfloat16x2_t table,
+ svuint8_t indices) {
+ return svluti6_lane_bf16_x2(table, indices, 1);
+}
+
__attribute__((target("sve2p3,sme")))
svfloat16_t has_streaming_sve2p3_luti6_lane(svfloat16x2_t table,
svuint8_t indices)
__arm_streaming {
return svluti6_lane_f16_x2(table, indices, 1);
}
+
+__attribute__((target("sve2p3,sme,bf16")))
+svbfloat16_t has_streaming_sve2p3_luti6_lane_bf16(svbfloat16x2_t table,
+ svuint8_t indices)
+ __arm_streaming {
+ return svluti6_lane_bf16_x2(table, indices, 0);
+}
+
+__attribute__((target("sme2p3,sme")))
+svfloat16_t has_streaming_sme2p3_luti6_lane(svfloat16x2_t table,
+ svuint8_t indices)
+ __arm_streaming {
+ return svluti6_lane_f16_x2(table, indices, 0);
+}
+
+__attribute__((target("sme2p3,sme,bf16")))
+svbfloat16_t has_streaming_sme2p3_luti6_lane_bf16(svbfloat16x2_t table,
+ svuint8_t indices)
+ __arm_streaming {
+ return svluti6_lane_bf16_x2(table, indices, 1);
+}
diff --git a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
index 50b406a7e91b8..a2bf43088968f 100644
--- a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll
@@ -32,3 +32,14 @@ define <vscale x 8 x half> @luti6_f16_x2(<vscale x 8 x half> %a, <vscale x 16 x
%res = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> %a, <vscale x 8 x half> %a, <vscale x 16 x i8> %b, i32 0)
ret <vscale x 8 x half> %res
}
+
+define <vscale x 8 x bfloat> @luti6_bf16_x2(<vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: luti6_bf16_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: luti6 z0.h, { z2.h, z3.h }, z1[1]
+; CHECK-NEXT: ret
+ %res = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti6.lane.x2.bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b, i32 1)
+ ret <vscale x 8 x bfloat> %res
+}
>From 39bb86ec1611f02b7a949e8cd2429574c9095636 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Tue, 14 Apr 2026 13:16:43 +0100
Subject: [PATCH 14/14] fixup! Address more PR comments
---
clang/include/clang/Basic/arm_sme.td | 1 -
clang/include/clang/Basic/arm_sve.td | 4 ++++
.../acle_sme2p3_target.c | 6 ++---
.../acle_sme2p3_target_lane.c | 16 -------------
.../acle_sve2p3_target_lane.c | 24 ++++++++++++-------
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 5 +---
.../AArch64/sme2p3-intrinsics-luti6.ll | 10 ++++----
7 files changed, 28 insertions(+), 38 deletions(-)
delete mode 100644 clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 2c77510a71f6e..c79e6e2ae1f9a 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -982,7 +982,6 @@ let SMETargetGuard = "sme-lutv2" in {
}
let SMETargetGuard = "sme2p3" in {
- def SVLUTI6_X4 : SInst<"svluti6_lane_{d}_x4[_{1}_x2]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
def SVLUTI6_ZT : SInst<"svluti6_zt_{d}", "di[", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
def SVLUTI6_ZT_X4 : SInst<"svluti6_zt_{d}_x4", "4i3.[", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
}
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index ea3c260319af7..c7b97f57638c0 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1884,6 +1884,10 @@ let SVETargetGuard = "sve2p3", SMETargetGuard = "sve2p3|sme2p3" in {
def SVLUTI6_x2 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti6_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>;
}
+let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2p3" in {
+ def SVLUTI6_X4 : SInst<"svluti6_lane_{d}_x4[_{1}_x2]", "422.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, ImmCheck0_1>]>;
+}
+
////////////////////////////////////////////////////////////////////////////////
// SVE2 - Optional
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
index 2cffc1344bfe1..52ed761c05897 100644
--- a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
+++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c
@@ -13,8 +13,8 @@ svint8_t has_sme2p3_zt(svuint8_t indices) __arm_streaming __arm_in("zt0") {
return svluti6_zt_s8(0, indices);
}
-__attribute__((target("sme2p3")))
-svfloat16_t has_sme2p3_implied_sme2p2(svbool_t pg, svfloat16_t op)
+__attribute__((target("sme2p3,bf16")))
+svbfloat16x4_t has_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
__arm_streaming {
- return svcompact_f16(pg, op);
+ return svluti6_lane_bf16_x4(table, indices, 0);
}
diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
deleted file mode 100644
index 1a06663a9aab7..0000000000000
--- a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c
+++ /dev/null
@@ -1,16 +0,0 @@
-// REQUIRES: aarch64-registered-target
-
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +bf16 -verify -emit-llvm -o - %s
-
-#include <arm_sme.h>
-
-svbfloat16x4_t missing_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
- __arm_streaming {
- return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature sme,sme2p3}}
-}
-
-__attribute__((target("sme2p3,bf16")))
-svbfloat16x4_t has_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices)
- __arm_streaming {
- return svluti6_lane_bf16_x4(table, indices, 0);
-}
diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
index 846e72d154c60..139b240919bb8 100644
--- a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
+++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c
@@ -14,9 +14,11 @@ svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) {
}
__attribute__((target("sve2p3,bf16")))
-svbfloat16_t has_sve2p3_luti6_lane_bf16(svbfloat16x2_t table,
- svuint8_t indices) {
- return svluti6_lane_bf16_x2(table, indices, 1);
+void has_sve2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
+ svbfloat16x2_t bf16_table,
+ svuint8_t indices) {
+ (void)svluti6_lane_f16_x2(f16_table, indices, 0);
+ (void)svluti6_lane_bf16_x2(bf16_table, indices, 1);
}
__attribute__((target("sve2p3,sme")))
@@ -27,10 +29,12 @@ svfloat16_t has_streaming_sve2p3_luti6_lane(svfloat16x2_t table,
}
__attribute__((target("sve2p3,sme,bf16")))
-svbfloat16_t has_streaming_sve2p3_luti6_lane_bf16(svbfloat16x2_t table,
- svuint8_t indices)
+void has_streaming_sve2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
+ svbfloat16x2_t bf16_table,
+ svuint8_t indices)
__arm_streaming {
- return svluti6_lane_bf16_x2(table, indices, 0);
+ (void)svluti6_lane_f16_x2(f16_table, indices, 1);
+ (void)svluti6_lane_bf16_x2(bf16_table, indices, 0);
}
__attribute__((target("sme2p3,sme")))
@@ -41,8 +45,10 @@ svfloat16_t has_streaming_sme2p3_luti6_lane(svfloat16x2_t table,
}
__attribute__((target("sme2p3,sme,bf16")))
-svbfloat16_t has_streaming_sme2p3_luti6_lane_bf16(svbfloat16x2_t table,
- svuint8_t indices)
+void has_streaming_sme2p3_luti6_lane_bf16(svfloat16x2_t f16_table,
+ svbfloat16x2_t bf16_table,
+ svuint8_t indices)
__arm_streaming {
- return svluti6_lane_bf16_x2(table, indices, 1);
+ (void)svluti6_lane_f16_x2(f16_table, indices, 0);
+ (void)svluti6_lane_bf16_x2(bf16_table, indices, 1);
}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index f547038a1353b..6e942ad6267ec 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4840,13 +4840,10 @@ let Predicates = [HasSVE2p3_or_SME2p3] in {
defm SQSHRUN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"sqshrun", 0b100, null_frag>;
defm SQSHRN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"sqshrn", 0b000, null_frag>;
defm UQSHRN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"uqshrn", 0b010, null_frag>;
+ defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">;
} // End HasSME2p3orSVE2p3
-let Predicates = [HasSVE2p3_or_SME2p3] in {
- defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">;
-}
-
//===----------------------------------------------------------------------===//
// SVE2.3 instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
index 7e6b9a280b254..8cf13f7f0cd71 100644
--- a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
+++ b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -verify-machineinstrs -force-streaming -mtriple=aarch64-none-linux-gnu -mattr=+sme2p3 < %s | FileCheck %s
-define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
+define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti6_zt_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: luti6 z0.b, zt0, z0
@@ -10,7 +10,7 @@ define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 {
ret <vscale x 16 x i8> %res
}
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @luti6_zt_i8_x4(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) #0 {
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @luti6_zt_i8_x4(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
; CHECK-LABEL: luti6_zt_i8_x4:
; CHECK: // %bb.0:
; CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z0 - z2 }
@@ -19,7 +19,7 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @luti6_i16_x4(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) #0 {
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @luti6_i16_x4(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: luti6_i16_x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z3.d, z0.d
@@ -31,7 +31,7 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @luti6_bf16_x4(<vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b) #0 {
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @luti6_bf16_x4(<vscale x 8 x bfloat> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: luti6_bf16_x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z3.d, z0.d
@@ -43,7 +43,7 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <v
ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @luti6_f16_x4(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) #0 {
+define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @luti6_f16_x4(<vscale x 8 x half> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: luti6_f16_x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z3.d, z0.d
More information about the cfe-commits
mailing list