[clang] 3e948eb - [AArch64][NEON] Add intrinsics for LUTI (#96883)
Author: Lukacma
Date: 2024-09-04T10:39:59+01:00
New Revision: 3e948eb3e88d89107406ca0812934bea42101e3a
URL: https://github.com/llvm/llvm-project/commit/3e948eb3e88d89107406ca0812934bea42101e3a
DIFF: https://github.com/llvm/llvm-project/commit/3e948eb3e88d89107406ca0812934bea42101e3a.diff
LOG: [AArch64][NEON] Add intrinsics for LUTI (#96883)
This patch adds intrinsics for the NEON LUTI2 and LUTI4 instructions, as
specified in the ACLE proposal
(https://github.com/ARM-software/acle/pull/324).
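LUTI2 reads packed 2-bit indices from a segment of an index vector and uses
each one to select an element of a table vector; LUTI4 does the same with
4-bit indices, and the _x2 variants draw on a pair of table registers. A
minimal usage sketch, assuming a compiler with the +lut target feature and
the ACLE names exercised by the tests added below (the helper name is
hypothetical):

#include <arm_neon.h>

// Hypothetical helper: each packed 2-bit index in the segment of
// 'indices' selected by the immediate picks an element of 'table'.
uint8x16_t expand_2bit(uint8x8_t table, uint8x16_t indices) {
  return vluti2_laneq_u8(table, indices, 0);
}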
Added:
clang/test/CodeGen/aarch64-neon-luti.c
llvm/test/CodeGen/AArch64/neon-luti.ll
Modified:
clang/include/clang/Basic/arm_neon.td
clang/lib/CodeGen/CGBuiltin.cpp
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64InstrFormats.td
llvm/lib/Target/AArch64/AArch64InstrInfo.td
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 3098fa67e6a512..536c0652280b9d 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2096,3 +2096,22 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r
def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">;
def VSTL1_LANE : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">;
}
+
+// Lookup table read with 2-bit/4-bit indices
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
+ def VLUTI2_B : SInst<"vluti2_lane", "Q.(qU)I", "cUcPcQcQUcQPc">;
+ def VLUTI2_B_Q : SInst<"vluti2_laneq", "Q.(QU)I", "cUcPcQcQUcQPc">;
+ def VLUTI2_H : SInst<"vluti2_lane", "Q.(<qU)I", "sUsPshQsQUsQPsQh">;
+ def VLUTI2_H_Q : SInst<"vluti2_laneq", "Q.(<QU)I", "sUsPshQsQUsQPsQh">;
+ def VLUTI4_B : SInst<"vluti4_lane", "..(qU)I", "QcQUcQPc">;
+ def VLUTI4_B_Q : SInst<"vluti4_laneq", "..UI", "QcQUcQPc">;
+ def VLUTI4_H_X2 : SInst<"vluti4_lane_x2", ".2(<qU)I", "QsQUsQPsQh">;
+ def VLUTI4_H_X2_Q : SInst<"vluti4_laneq_x2", ".2(<U)I", "QsQUsQPsQh">;
+
+  let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut,bf16" in {
+ def VLUTI2_BF : SInst<"vluti2_lane", "Q.(<qU)I", "bQb">;
+ def VLUTI2_BF_Q : SInst<"vluti2_laneq", "Q.(<QU)I", "bQb">;
+ def VLUTI4_BF_X2 : SInst<"vluti4_lane_x2", ".2(<qU)I", "Qb">;
+ def VLUTI4_BF_X2_Q : SInst<"vluti4_laneq_x2", ".2(<U)I", "Qb">;
+ }
+}
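The nested let combines the guards, so the bf16 variants are available only
when both the lut and bf16 target features are enabled. A sketch of one of
them, with the signature taken from the tests added below (helper name
hypothetical):

#include <arm_neon.h>

// Requires both +lut and +bf16, per the combined TargetGuard above.
bfloat16x8_t lut_bf16(bfloat16x4_t table, uint8x8_t indices) {
  return vluti2_lane_bf16(table, indices, 0);
}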
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 012b43b8770bef..e826c1c6fbbd23 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13481,6 +13481,95 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_suqadd;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
}
+
+ case NEON::BI__builtin_neon_vluti2_laneq_bf16:
+ case NEON::BI__builtin_neon_vluti2_laneq_f16:
+ case NEON::BI__builtin_neon_vluti2_laneq_p16:
+ case NEON::BI__builtin_neon_vluti2_laneq_p8:
+ case NEON::BI__builtin_neon_vluti2_laneq_s16:
+ case NEON::BI__builtin_neon_vluti2_laneq_s8:
+ case NEON::BI__builtin_neon_vluti2_laneq_u16:
+ case NEON::BI__builtin_neon_vluti2_laneq_u8: {
+ Int = Intrinsic::aarch64_neon_vluti2_laneq;
+ llvm::Type *Tys[2];
+ Tys[0] = Ty;
+ Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+ /*isQuad*/ false));
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq");
+ }
+ case NEON::BI__builtin_neon_vluti2q_laneq_bf16:
+ case NEON::BI__builtin_neon_vluti2q_laneq_f16:
+ case NEON::BI__builtin_neon_vluti2q_laneq_p16:
+ case NEON::BI__builtin_neon_vluti2q_laneq_p8:
+ case NEON::BI__builtin_neon_vluti2q_laneq_s16:
+ case NEON::BI__builtin_neon_vluti2q_laneq_s8:
+ case NEON::BI__builtin_neon_vluti2q_laneq_u16:
+ case NEON::BI__builtin_neon_vluti2q_laneq_u8: {
+ Int = Intrinsic::aarch64_neon_vluti2_laneq;
+ llvm::Type *Tys[2];
+ Tys[0] = Ty;
+ Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+ /*isQuad*/ true));
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq");
+ }
+ case NEON::BI__builtin_neon_vluti2_lane_bf16:
+ case NEON::BI__builtin_neon_vluti2_lane_f16:
+ case NEON::BI__builtin_neon_vluti2_lane_p16:
+ case NEON::BI__builtin_neon_vluti2_lane_p8:
+ case NEON::BI__builtin_neon_vluti2_lane_s16:
+ case NEON::BI__builtin_neon_vluti2_lane_s8:
+ case NEON::BI__builtin_neon_vluti2_lane_u16:
+ case NEON::BI__builtin_neon_vluti2_lane_u8: {
+ Int = Intrinsic::aarch64_neon_vluti2_lane;
+ llvm::Type *Tys[2];
+ Tys[0] = Ty;
+ Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+ /*isQuad*/ false));
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane");
+ }
+ case NEON::BI__builtin_neon_vluti2q_lane_bf16:
+ case NEON::BI__builtin_neon_vluti2q_lane_f16:
+ case NEON::BI__builtin_neon_vluti2q_lane_p16:
+ case NEON::BI__builtin_neon_vluti2q_lane_p8:
+ case NEON::BI__builtin_neon_vluti2q_lane_s16:
+ case NEON::BI__builtin_neon_vluti2q_lane_s8:
+ case NEON::BI__builtin_neon_vluti2q_lane_u16:
+ case NEON::BI__builtin_neon_vluti2q_lane_u8: {
+ Int = Intrinsic::aarch64_neon_vluti2_lane;
+ llvm::Type *Tys[2];
+ Tys[0] = Ty;
+ Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+ /*isQuad*/ true));
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane");
+ }
+ case NEON::BI__builtin_neon_vluti4q_lane_p8:
+ case NEON::BI__builtin_neon_vluti4q_lane_s8:
+ case NEON::BI__builtin_neon_vluti4q_lane_u8: {
+ Int = Intrinsic::aarch64_neon_vluti4q_lane;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane");
+ }
+ case NEON::BI__builtin_neon_vluti4q_laneq_p8:
+ case NEON::BI__builtin_neon_vluti4q_laneq_s8:
+ case NEON::BI__builtin_neon_vluti4q_laneq_u8: {
+ Int = Intrinsic::aarch64_neon_vluti4q_laneq;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq");
+ }
+ case NEON::BI__builtin_neon_vluti4q_lane_bf16_x2:
+ case NEON::BI__builtin_neon_vluti4q_lane_f16_x2:
+ case NEON::BI__builtin_neon_vluti4q_lane_p16_x2:
+ case NEON::BI__builtin_neon_vluti4q_lane_s16_x2:
+ case NEON::BI__builtin_neon_vluti4q_lane_u16_x2: {
+ Int = Intrinsic::aarch64_neon_vluti4q_lane_x2;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane_x2");
+ }
+ case NEON::BI__builtin_neon_vluti4q_laneq_bf16_x2:
+ case NEON::BI__builtin_neon_vluti4q_laneq_f16_x2:
+ case NEON::BI__builtin_neon_vluti4q_laneq_p16_x2:
+ case NEON::BI__builtin_neon_vluti4q_laneq_s16_x2:
+ case NEON::BI__builtin_neon_vluti4q_laneq_u16_x2: {
+ Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
+ }
}
}
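Each case builds a two-entry overload-type list: Tys[0] is the result vector
type and Tys[1] the table's vector type, so one intrinsic definition serves
every element type and width, and the signed, unsigned, and polynomial
variants of a given shape share the same instantiation. A sketch of the
effect, with the intrinsic manglings as they appear in the tests below
(helper names hypothetical):

#include <arm_neon.h>

// Both calls reach the same overloaded intrinsic, instantiated as
//   @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8  for the u8 form and
//   @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16 for the u16 form.
uint8x16_t  lut_b(uint8x8_t t, uint8x8_t i)  { return vluti2_lane_u8(t, i, 0); }
uint16x8_t  lut_h(uint16x4_t t, uint8x8_t i) { return vluti2_lane_u16(t, i, 0); }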
diff --git a/clang/test/CodeGen/aarch64-neon-luti.c b/clang/test/CodeGen/aarch64-neon-luti.c
new file mode 100644
index 00000000000000..72cb6bcdb40f08
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-luti.c
@@ -0,0 +1,506 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+#include <arm_neon.h>
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_u8(
+// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]]
+//
+uint8x16_t test_vluti2_lane_u8(uint8x8_t vn, uint8x8_t vm) {
+ return vluti2_lane_u8(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_u8(
+// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]]
+//
+uint8x16_t test_vluti2_laneq_u8(uint8x8_t vn, uint8x16_t vm) {
+ return vluti2_laneq_u8(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_u8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]]
+//
+uint8x16_t test_vluti2q_lane_u8(uint8x16_t vn, uint8x8_t vm) {
+ return vluti2q_lane_u8(vn, vm, 3);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_u8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]]
+//
+uint8x16_t test_vluti2q_laneq_u8(uint8x16_t vn, uint8x16_t vm) {
+ return vluti2q_laneq_u8(vn, vm, 7);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_s8(
+// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]]
+//
+int8x16_t test_vluti2_lane_s8(int8x8_t vn, uint8x8_t vm) {
+ return vluti2_lane_s8(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_s8(
+// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]]
+//
+int8x16_t test_vluti2_laneq_s8(int8x8_t vn, uint8x16_t vm) {
+ return vluti2_laneq_s8(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_s8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]]
+//
+int8x16_t test_vluti2q_lane_s8(int8x16_t vn, uint8x8_t vm) {
+ return vluti2q_lane_s8(vn, vm, 3);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_s8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]]
+//
+int8x16_t test_vluti2q_laneq_s8(int8x16_t vn, uint8x16_t vm) {
+ return vluti2q_laneq_s8(vn, vm, 7);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_p8(
+// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]]
+//
+poly8x16_t test_vluti2_lane_p8(poly8x8_t vn, uint8x8_t vm) {
+ return vluti2_lane_p8(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_p8(
+// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]]
+//
+poly8x16_t test_vluti2_laneq_p8(poly8x8_t vn, uint8x16_t vm) {
+ return vluti2_laneq_p8(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_p8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]]
+//
+poly8x16_t test_vluti2q_lane_p8(poly8x16_t vn, uint8x8_t vm) {
+ return vluti2q_lane_p8(vn, vm, 3);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_p8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]]
+//
+poly8x16_t test_vluti2q_laneq_p8(poly8x16_t vn, uint8x16_t vm) {
+ return vluti2q_laneq_p8(vn, vm, 7);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> [[VN]], <8 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]]
+//
+uint16x8_t test_vluti2_lane_u16(uint16x4_t vn, uint8x8_t vm) {
+ return vluti2_lane_u16(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_u16(
+// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]]
+//
+uint16x8_t test_vluti2_laneq_u16(uint16x4_t vn, uint8x16_t vm) {
+ return vluti2_laneq_u16(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_u16(
+// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<8 x i16> [[VN]], <8 x i8> [[VM]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]]
+//
+uint16x8_t test_vluti2q_lane_u16(uint16x8_t vn, uint8x8_t vm) {
+ return vluti2q_lane_u16(vn, vm, 3);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_u16(
+// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]]
+//
+uint16x8_t test_vluti2q_laneq_u16(uint16x8_t vn, uint8x16_t vm) {
+ return vluti2q_laneq_u16(vn, vm, 7);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> [[VN]], <8 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]]
+//
+int16x8_t test_vluti2_lane_s16(int16x4_t vn, uint8x8_t vm) {
+ return vluti2_lane_s16(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_s16(
+// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]]
+//
+int16x8_t test_vluti2_laneq_s16(int16x4_t vn, uint8x16_t vm) {
+ return vluti2_laneq_s16(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<8 x i16> [[VN]], <8 x i8> [[VM]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]]
+//
+int16x8_t test_vluti2q_lane_s16(int16x8_t vn, uint8x8_t vm) {
+ return vluti2q_lane_s16(vn, vm, 3);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_s16(
+// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]]
+//
+int16x8_t test_vluti2q_laneq_s16(int16x8_t vn, uint8x16_t vm) {
+ return vluti2q_laneq_s16(vn, vm, 7);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vluti2_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v4f16(<4 x half> [[VN]], <8 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANE1]]
+//
+float16x8_t test_vluti2_lane_f16(float16x4_t vn, uint8x8_t vm) {
+ return vluti2_lane_f16(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vluti2_laneq_f16(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v4f16(<4 x half> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANEQ1]]
+//
+float16x8_t test_vluti2_laneq_f16(float16x4_t vn, uint8x16_t vm) {
+ return vluti2_laneq_f16(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vluti2q_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v8f16(<8 x half> [[VN]], <8 x i8> [[VM]], i32 3)
+// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANE1]]
+//
+float16x8_t test_vluti2q_lane_f16(float16x8_t vn, uint8x8_t vm) {
+ return vluti2q_lane_f16(vn, vm, 3);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vluti2q_laneq_f16(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v8f16(<8 x half> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANEQ1]]
+//
+float16x8_t test_vluti2q_laneq_f16(float16x8_t vn, uint8x16_t vm) {
+ return vluti2q_laneq_f16(vn, vm, 7);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2_lane_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v4bf16(<4 x bfloat> [[VN]], <8 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANE1]]
+//
+bfloat16x8_t test_vluti2_lane_bf16(bfloat16x4_t vn, uint8x8_t vm) {
+ return vluti2_lane_bf16(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2_laneq_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v4bf16(<4 x bfloat> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANEQ1]]
+//
+bfloat16x8_t test_vluti2_laneq_bf16(bfloat16x4_t vn, uint8x16_t vm) {
+ return vluti2_laneq_bf16(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2q_lane_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v8bf16(<8 x bfloat> [[VN]], <8 x i8> [[VM]], i32 3)
+// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANE1]]
+//
+bfloat16x8_t test_vluti2q_lane_bf16(bfloat16x8_t vn, uint8x8_t vm) {
+ return vluti2q_lane_bf16(vn, vm, 3);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2q_laneq_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v8bf16(<8 x bfloat> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANEQ1]]
+//
+bfloat16x8_t test_vluti2q_laneq_bf16(bfloat16x8_t vn, uint8x16_t vm) {
+ return vluti2q_laneq_bf16(vn, vm, 7);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_p16(
+// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> [[VN]], <8 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]]
+//
+poly16x8_t test_vluti2_lane_p16(poly16x4_t vn, uint8x8_t vm) {
+ return vluti2_lane_p16(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_p16(
+// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]]
+//
+poly16x8_t test_vluti2_laneq_p16(poly16x4_t vn, uint8x16_t vm) {
+ return vluti2_laneq_p16(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_p16(
+// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANE1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<8 x i16> [[VN]], <8 x i8> [[VM]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE1]]
+//
+poly16x8_t test_vluti2q_lane_p16(poly16x8_t vn, uint8x8_t vm) {
+ return vluti2q_lane_p16(vn, vm, 3);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_p16(
+// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI2_LANEQ1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANEQ1]]
+//
+poly16x8_t test_vluti2q_laneq_p16(poly16x8_t vn, uint8x16_t vm) {
+ return vluti2q_laneq_p16(vn, vm, 7);
+}
+
+//
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_lane_u8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI4Q_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANE]]
+//
+uint8x16_t test_vluti4q_lane_u8(uint8x16_t vn, uint8x8_t vm) {
+ return vluti4q_lane_u8(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_u8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]]
+//
+uint8x16_t test_vluti4q_laneq_u8(uint8x16_t vn, uint8x16_t vm) {
+ return vluti4q_laneq_u8(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_lane_s8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI4Q_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 1)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANE]]
+//
+int8x16_t test_vluti4q_lane_s8(int8x16_t vn, uint8x8_t vm) {
+ return vluti4q_lane_s8(vn, vm, 1);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_s8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 1)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]]
+//
+int8x16_t test_vluti4q_laneq_s8(int8x16_t vn, uint8x16_t vm) {
+ return vluti4q_laneq_s8(vn, vm, 1);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_lane_p8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI4Q_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 1)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANE]]
+//
+poly8x16_t test_vluti4q_lane_p8(poly8x16_t vn, uint8x8_t vm) {
+ return vluti4q_lane_p8(vn, vm, 1);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_p8(
+// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 1)
+// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]]
+//
+poly8x16_t test_vluti4q_laneq_p8(poly8x16_t vn, uint8x16_t vm) {
+ return vluti4q_laneq_p8(vn, vm, 1);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_lane_u16_x2(
+// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0
+// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1
+// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANE_X24]]
+//
+uint16x8_t test_vluti4q_lane_u16_x2(uint16x8x2_t vn, uint8x8_t vm) {
+ return vluti4q_lane_u16_x2(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_u16_x2(
+// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0
+// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1
+// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]]
+//
+uint16x8_t test_vluti4q_laneq_u16_x2(uint16x8x2_t vn, uint8x16_t vm) {
+ return vluti4q_laneq_u16_x2(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_lane_s16_x2(
+// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0
+// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1
+// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANE_X24]]
+//
+int16x8_t test_vluti4q_lane_s16_x2(int16x8x2_t vn, uint8x8_t vm) {
+ return vluti4q_lane_s16_x2(vn, vm, 3);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_s16_x2(
+// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0
+// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1
+// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]]
+//
+int16x8_t test_vluti4q_laneq_s16_x2(int16x8x2_t vn, uint8x16_t vm) {
+ return vluti4q_laneq_s16_x2(vn, vm, 3);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vluti4q_lane_f16_x2(
+// CHECK-SAME: [2 x <8 x half>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 0
+// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 1
+// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti4q.lane.x2.v8f16(<8 x half> [[VN_COERCE_FCA_0_EXTRACT]], <8 x half> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 1)
+// CHECK-NEXT: ret <8 x half> [[VLUTI4Q_LANE_X24]]
+//
+float16x8_t test_vluti4q_lane_f16_x2(float16x8x2_t vn, uint8x8_t vm) {
+ return vluti4q_lane_f16_x2(vn, vm, 1);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vluti4q_laneq_f16_x2(
+// CHECK-SAME: [2 x <8 x half>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 0
+// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 1
+// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti4q.laneq.x2.v8f16(<8 x half> [[VN_COERCE_FCA_0_EXTRACT]], <8 x half> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 1)
+// CHECK-NEXT: ret <8 x half> [[VLUTI4Q_LANEQ_X24]]
+//
+float16x8_t test_vluti4q_laneq_f16_x2(float16x8x2_t vn, uint8x16_t vm) {
+ return vluti4q_laneq_f16_x2(vn, vm, 1);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti4q_lane_bf16_x2(
+// CHECK-SAME: [2 x <8 x bfloat>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 0
+// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 1
+// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.lane.x2.v8bf16(<8 x bfloat> [[VN_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 2)
+// CHECK-NEXT: ret <8 x bfloat> [[VLUTI4Q_LANE_X24]]
+//
+bfloat16x8_t test_vluti4q_lane_bf16_x2(bfloat16x8x2_t vn, uint8x8_t vm) {
+ return vluti4q_lane_bf16_x2(vn, vm, 2);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti4q_laneq_bf16_x2(
+// CHECK-SAME: [2 x <8 x bfloat>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 0
+// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 1
+// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.laneq.x2.v8bf16(<8 x bfloat> [[VN_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 2)
+// CHECK-NEXT: ret <8 x bfloat> [[VLUTI4Q_LANEQ_X24]]
+//
+bfloat16x8_t test_vluti4q_laneq_bf16_x2(bfloat16x8x2_t vn, uint8x16_t vm) {
+ return vluti4q_laneq_bf16_x2(vn, vm, 2);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_lane_p16_x2(
+// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0
+// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1
+// CHECK-NEXT: [[VLUTI4Q_LANE_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANE_X24]]
+//
+poly16x8_t test_vluti4q_lane_p16_x2(poly16x8x2_t vn, uint8x8_t vm) {
+ return vluti4q_lane_p16_x2(vn, vm, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_p16_x2(
+// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0
+// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1
+// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]]
+//
+poly16x8_t test_vluti4q_laneq_p16_x2(poly16x8x2_t vn, uint8x16_t vm) {
+ return vluti4q_laneq_p16_x2(vn, vm, 0);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 6c50b18ee583fd..6727ee69d7b3ee 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -565,6 +565,41 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_vcmla_rot270 : AdvSIMD_3VectorArg_Intrinsic;
}
+let TargetPrefix = "aarch64" in {
+def int_aarch64_neon_vluti2_lane : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty, llvm_v8i8_ty,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
+def int_aarch64_neon_vluti2_laneq : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty, llvm_v16i8_ty,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
+def int_aarch64_neon_vluti4q_lane: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_v8i8_ty,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
+def int_aarch64_neon_vluti4q_laneq: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_v16i8_ty,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
+
+def int_aarch64_neon_vluti4q_lane_x2:
+ DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>,
+ llvm_v8i8_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+
+def int_aarch64_neon_vluti4q_laneq_x2:
+ DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>,
+ llvm_v16i8_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+}
+
let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
class AdvSIMD_2Vector2Index_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
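The ImmArg properties on these definitions require the lane operand to be an
IR-level immediate, which is why the lane argument at the C level must be an
integer constant expression. A sketch (helper name hypothetical):

#include <arm_neon.h>

// Fine: the lane is a constant expression.
uint8x16_t pick(uint8x16_t t, uint8x8_t i) { return vluti4q_lane_u8(t, i, 1); }
// Passing a runtime value as the lane, e.g. vluti4q_lane_u8(t, i, n),
// would be rejected: ImmArg demands an immediate.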
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index ab8251dc830147..16002011aedfbe 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -8239,11 +8239,11 @@ multiclass SIMDTableLookupTied<bit op, string asm> {
// AdvSIMD LUT
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDTableLookupIndexed<bit Q, bits<5> opc, RegisterOperand vectype,
+class BaseSIMDTableLookupIndexed<bit Q, bits<5> opc,
RegisterOperand listtype, Operand idx_type,
string asm, string kind>
- : I<(outs vectype:$Rd),
- (ins listtype:$Rn, vectype:$Rm, idx_type:$idx),
+ : I<(outs V128:$Rd),
+ (ins listtype:$Rn, V128:$Rm, idx_type:$idx),
asm, "\t$Rd" # kind # ", $Rn, $Rm$idx", "", []>,
Sched<[]> {
bits<5> Rd;
@@ -8263,22 +8263,22 @@ class BaseSIMDTableLookupIndexed<bit Q, bits<5> opc, RegisterOperand vectype,
}
multiclass BaseSIMDTableLookupIndexed2<string asm> {
- def v16f8 : BaseSIMDTableLookupIndexed<0b1, {0b10,?,?,0b1}, V128, VecListOne16b, VectorIndexS, asm, ".16b"> {
+ def _B : BaseSIMDTableLookupIndexed<0b1, {0b10,?,?,0b1}, VecListOne16b, VectorIndexS32b_timm, asm, ".16b"> {
bits<2> idx;
let Inst{14-13} = idx;
}
- def v8f16 : BaseSIMDTableLookupIndexed<0b1, {0b11,?,?,?}, V128, VecListOne8h, VectorIndexH, asm, ".8h" > {
+ def _H : BaseSIMDTableLookupIndexed<0b1, {0b11,?,?,?}, VecListOne8h, VectorIndexH32b_timm, asm, ".8h" > {
bits<3> idx;
let Inst{14-12} = idx;
}
}
multiclass BaseSIMDTableLookupIndexed4<string asm> {
- def v16f8 : BaseSIMDTableLookupIndexed<0b1, {0b01,?,0b10}, V128, VecListOne16b, VectorIndexD, asm, ".16b"> {
+ def _B : BaseSIMDTableLookupIndexed<0b1, {0b01,?,0b10}, VecListOne16b, VectorIndexD32b_timm, asm, ".16b"> {
bit idx;
let Inst{14} = idx;
}
- def v8f16 : BaseSIMDTableLookupIndexed<0b1, {0b01,?,?,0b1}, V128, VecListTwo8h, VectorIndexS, asm, ".8h" > {
+ def _H : BaseSIMDTableLookupIndexed<0b1, {0b01,?,?,0b1}, VecListTwo8h, VectorIndexS32b_timm, asm, ".8h" > {
bits<2> idx;
let Inst{14-13} = idx;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c659697c3a1be3..ccef85bfaa8afc 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6616,6 +6616,46 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
let Predicates = [HasLUT] in {
defm LUT2 : BaseSIMDTableLookupIndexed2<"luti2">;
defm LUT4 : BaseSIMDTableLookupIndexed4<"luti4">;
+
+ multiclass Luti2_patterns<Instruction Instr, ValueType VT64, ValueType VT128>{
+ def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT64:$Rn,
+ v8i8:$Rm, i32:$idx)),
+ (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>;
+ def : Pat<(VT128 (int_aarch64_neon_vluti2_laneq VT64:$Rn,
+ v16i8:$Rm, i32:$idx)),
+ (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub),
+ V128:$Rm, VectorIndexS32b_timm:$idx)>;
+ def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT128:$Rn,
+ v8i8:$Rm, i32:$idx)),
+ (Instr V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),
+ VectorIndexS32b_timm:$idx)>;
+ def : Pat<(VT128 (int_aarch64_neon_vluti2_laneq VT128:$Rn,
+ v16i8:$Rm, i32:$idx)),
+ (Instr V128:$Rn, V128:$Rm, VectorIndexS32b_timm:$idx)>;
+ }
+
+ defm : Luti2_patterns<LUT2_B, v8i8, v16i8>;
+ defm : Luti2_patterns<LUT2_H, v4i16, v8i16>;
+ defm : Luti2_patterns<LUT2_H, v4f16, v8f16>;
+ defm : Luti2_patterns<LUT2_H, v4bf16, v8bf16>;
+
+ def : Pat<(v16i8 (int_aarch64_neon_vluti4q_laneq v16i8:$Rn,
+ v16i8:$Rm, i32:$idx)),
+ (LUT4_B VecListOne16b:$Rn, V128:$Rm, VectorIndexD32b_timm:$idx)>;
+ def : Pat<(v16i8 (int_aarch64_neon_vluti4q_lane v16i8:$Rn,
+ v8i8:$Rm, i32:$idx)),
+ (LUT4_B VecListOne16b:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexD32b_timm:$idx)>;
+
+ foreach VT = [v8i16, v8f16, v8bf16] in {
+ def : Pat<(VT (int_aarch64_neon_vluti4q_laneq_x2 VT:$Rn1,
+ VT:$Rn2, v16i8:$Rm, i32:$idx)),
+ (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>;
+ def : Pat<(VT (int_aarch64_neon_vluti4q_lane_x2 VT:$Rn1,
+ VT:$Rn2, v8i8:$Rm, i32:$idx)),
+ (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>;
+ }
}
//----------------------------------------------------------------------------
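The REG_SEQUENCE in the x2 patterns ties the two table operands into a
consecutive Q-register pair, and the INSERT_SUBREG patterns widen 64-bit
operands into the low half of a Q register so one LUT2/LUT4 encoding covers
both widths. A sketch of the pair-table form, with the expected instruction
as in the llc test below (helper name hypothetical):

#include <arm_neon.h>

// Selects among the 16 halfword entries held in a register pair:
//   luti4 v0.8h, { v0.8h, v1.8h }, v2[1]
uint16x8_t lut_pair(uint16x8x2_t table, uint8x8_t indices) {
  return vluti4q_lane_u16_x2(table, indices, 1);
}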
diff --git a/llvm/test/CodeGen/AArch64/neon-luti.ll b/llvm/test/CodeGen/AArch64/neon-luti.ll
new file mode 100644
index 00000000000000..54366627537626
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-luti.ll
@@ -0,0 +1,253 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon,+lut,+bf16 | FileCheck %s
+
+define <16 x i8> @test_luti2_lane_i8(<8 x i8> %vn, <8 x i8> %vm){
+; CHECK-LABEL: test_luti2_lane_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> %vn, <8 x i8> %vm, i32 0)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_luti2_laneq_i8(<8 x i8> %vn, <16 x i8> %vm){
+; CHECK-LABEL: test_luti2_laneq_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> %vn, <16 x i8> %vm, i32 0)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_luti2q_lane_i8(<16 x i8> %vn, <8 x i8> %vm){
+; CHECK-LABEL: test_luti2q_lane_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> %vn, <8 x i8> %vm, i32 0)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_luti2q_laneq_i8(<16 x i8> %vn, <16 x i8> %vm){
+; CHECK-LABEL: test_luti2q_laneq_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> %vn, <16 x i8> %vm, i32 0)
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @test_luti2_lane_i16(<4 x i16> %vn, <8 x i8> %vm){
+; CHECK-LABEL: test_luti2_lane_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16(<4 x i16> %vn, <8 x i8> %vm, i32 0)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_luti2_laneq_i16(<4 x i16> %vn, <16 x i8> %vm){
+; CHECK-LABEL: test_luti2_laneq_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v4i16(<4 x i16> %vn, <16 x i8> %vm, i32 0)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_luti2q_lane_i16(<4 x i16> %vn, <8 x i8> %vm){
+; CHECK-LABEL: test_luti2q_lane_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16(<4 x i16> %vn, <8 x i8> %vm, i32 0)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_luti2q_laneq_i16(<8 x i16> %vn, <16 x i8> %vm){
+; CHECK-LABEL: test_luti2q_laneq_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.laneq.v8i16.v8i16(<8 x i16> %vn, <16 x i8> %vm, i32 0)
+ ret <8 x i16> %res
+}
+
+define <8 x half> @test_luti2_lane_f16(<4 x half> %vn, <8 x i8> %vm){
+; CHECK-LABEL: test_luti2_lane_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v4f16(<4 x half> %vn, <8 x i8> %vm, i32 0)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_luti2_laneq_f16(<4 x half> %vn, <16 x i8> %vm){
+; CHECK-LABEL: test_luti2_laneq_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v4i16(<4 x half> %vn, <16 x i8> %vm, i32 0)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_luti2q_lane_f16(<8 x half> %vn, <8 x i8> %vm){
+; CHECK-LABEL: test_luti2q_lane_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v8f16(<8 x half> %vn, <8 x i8> %vm, i32 0)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_luti2q_laneq_f16(<8 x half> %vn, <16 x i8> %vm){
+; CHECK-LABEL: test_luti2q_laneq_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.laneq.v8f16.v8f16(<8 x half> %vn, <16 x i8> %vm, i32 0)
+ ret <8 x half> %res
+}
+
+define <8 x bfloat> @test_luti2_lane_bf16(<4 x bfloat> %vn, <8 x i8> %vm){
+; CHECK-LABEL: test_luti2_lane_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v4bf16(<4 x bfloat> %vn, <8 x i8> %vm, i32 0)
+ ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_luti2_laneq_bf16(<4 x bfloat> %vn, <16 x i8> %vm){
+; CHECK-LABEL: test_luti2_laneq_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v4bf16(<4 x bfloat> %vn, <16 x i8> %vm, i32 0)
+ ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_luti2q_lane_bf16(<4 x bfloat> %vn, <8 x i8> %vm){
+; CHECK-LABEL: test_luti2q_lane_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v8bf16(<4 x bfloat> %vn, <8 x i8> %vm, i32 0)
+ ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_luti2q_laneq_bf16(<8 x bfloat> %vn, <16 x i8> %vm){
+; CHECK-LABEL: test_luti2q_laneq_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.laneq.v8bf16.v8bf16(<8 x bfloat> %vn, <16 x i8> %vm, i32 0)
+ ret <8 x bfloat> %res
+}
+
+define <16 x i8> @test_luti4q_lane_i8(<16 x i8> %vn, <8 x i8> %vm){
+; CHECK-LABEL: test_luti4q_lane_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: luti4 v0.16b, { v0.16b }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> %vn, <8 x i8> %vm, i32 0)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_luti4q_laneq_i8(<16 x i8> %vn, <16 x i8> %vm){
+; CHECK-LABEL: test_luti4q_laneq_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: luti4 v0.16b, { v0.16b }, v1[0]
+; CHECK-NEXT: ret
+ %res= tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> %vn, <16 x i8> %vm, i32 0)
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @test_luti4q_lane_x2_i16(<8 x i16> %vn1, <8 x i16> %vn2, <8 x i8> %vm){
+; CHECK-LABEL: test_luti4q_lane_x2_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1]
+; CHECK-NEXT: ret
+ %res= tail call <8 x i16> @llvm.aarch64.neon.vluti4q.lane.x2.v8i16(<8 x i16> %vn1, <8 x i16> %vn2, <8 x i8> %vm, i32 1)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_luti4q_laneq_x2_i16(<8 x i16> %vn1, <8 x i16> %vn2, <16 x i8> %vm){
+; CHECK-LABEL: test_luti4q_laneq_x2_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1]
+; CHECK-NEXT: ret
+ %res= tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> %vn1, <8 x i16> %vn2, <16 x i8> %vm, i32 1)
+ ret <8 x i16> %res
+}
+
+define <8 x half> @test_luti4q_lane_x2_f16(<8 x half>%vn1, <8 x half> %vn2, <8 x i8> %vm){
+; CHECK-LABEL: test_luti4q_lane_x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1]
+; CHECK-NEXT: ret
+ %res= tail call <8 x half> @llvm.aarch64.neon.vluti4q.lane.x2.v8f16(<8 x half> %vn1, <8 x half> %vn2, <8 x i8> %vm, i32 1)
+ ret <8 x half> %res
+}
+
+
+define <8 x half> @test_luti4q_laneq_x2_f16(<8 x half>%vn1, <8 x half> %vn2, <16 x i8> %vm){
+; CHECK-LABEL: test_luti4q_laneq_x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1]
+; CHECK-NEXT: ret
+ %res= tail call <8 x half> @llvm.aarch64.neon.vluti4q.laneq.x2.v8f16(<8 x half> %vn1, <8 x half> %vn2, <16 x i8> %vm, i32 1)
+ ret <8 x half> %res
+}
+
+define <8 x bfloat> @test_luti4q_laneq_x2_bf16(<8 x bfloat>%vn1, <8 x bfloat> %vn2, <16 x i8> %vm){
+; CHECK-LABEL: test_luti4q_laneq_x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1]
+; CHECK-NEXT: ret
+ %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.laneq.x2.v8bf16(<8 x bfloat> %vn1, <8 x bfloat> %vn2, <16 x i8> %vm, i32 1)
+ ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_luti4q_lane_x2_bf16(<8 x bfloat>%vn1, <8 x bfloat> %vn2, <8 x i8> %vm){
+; CHECK-LABEL: test_luti4q_lane_x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1]
+; CHECK-NEXT: ret
+ %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.lane.x2.v8bf16(<8 x bfloat> %vn1, <8 x bfloat> %vn2, <8 x i8> %vm, i32 1)
+ ret <8 x bfloat> %res
+}