[clang] b000f90 - [RISCV] Add riscv_packed_simd.h for P extension intrinsics (#181115)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Jun 10 02:00:42 PDT 2026
Author: SiHuaN
Date: 2026-06-10T17:00:37+08:00
New Revision: b000f9032911f32c0e68e373e083ccc90aae0005
URL: https://github.com/llvm/llvm-project/commit/b000f9032911f32c0e68e373e083ccc90aae0005
DIFF: https://github.com/llvm/llvm-project/commit/b000f9032911f32c0e68e373e083ccc90aae0005.diff
LOG: [RISCV] Add riscv_packed_simd.h for P extension intrinsics (#181115)
Add `riscv_packed_simd.h` with initial RISC-V P extension intrinsics, covering:
- Packed Splat
- Packed Addition and Subtraction
- Packed Addition with Scalar
- Packed Saturating Addition and Subtraction
- Packed Shift-Add
- Packed Minimum and Maximum
- Packed Shifts
- Packed Logical Operations
The intrinsics are implemented as thin wrappers over standard C operators
and existing generic builtins (`__builtin_elementwise_add_sat` etc.), letting
the RISC-V backend lower the resulting `<N x iN>` IR to P-ext instructions.
No new clang builtins or `llvm.riscv.*` intrinsics are introduced.
Spec: https://github.com/riscv/riscv-p-spec/blob/master/P-ext-intrinsics.adoc
Added:
clang/lib/Headers/riscv_packed_simd.h
clang/test/CodeGen/RISCV/rvp-intrinsics.c
cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
Modified:
clang/lib/Headers/CMakeLists.txt
Removed:
################################################################################
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ce34f8b9410a7..439f2725168ba 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -140,6 +140,7 @@ set(riscv_files
riscv_corev_alu.h
riscv_mips.h
riscv_nds.h
+ riscv_packed_simd.h
sifive_vector.h
)
diff --git a/clang/lib/Headers/riscv_packed_simd.h b/clang/lib/Headers/riscv_packed_simd.h
new file mode 100644
index 0000000000000..828cb90f8034a
--- /dev/null
+++ b/clang/lib/Headers/riscv_packed_simd.h
@@ -0,0 +1,306 @@
+/*===---- riscv_packed_simd.h - RISC-V P intrinsics ------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_PACKED_SIMD_H
+#define __RISCV_PACKED_SIMD_H
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Packed SIMD Types */
+
+typedef int8_t int8x4_t __attribute__((__vector_size__(4)));
+typedef uint8_t uint8x4_t __attribute__((__vector_size__(4)));
+typedef int16_t int16x2_t __attribute__((__vector_size__(4)));
+typedef uint16_t uint16x2_t __attribute__((__vector_size__(4)));
+
+typedef int8_t int8x8_t __attribute__((__vector_size__(8)));
+typedef uint8_t uint8x8_t __attribute__((__vector_size__(8)));
+typedef int16_t int16x4_t __attribute__((__vector_size__(8)));
+typedef uint16_t uint16x4_t __attribute__((__vector_size__(8)));
+typedef int32_t int32x2_t __attribute__((__vector_size__(8)));
+typedef uint32_t uint32x2_t __attribute__((__vector_size__(8)));
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+#define __packed_splat2(ty, x) ((ty){(x), (x)})
+#define __packed_splat4(ty, x) ((ty){(x), (x), (x), (x)})
+#define __packed_splat8(ty, x) ((ty){(x), (x), (x), (x), (x), (x), (x), (x)})
+
+#define __packed_splat(name, ty, scalar_ty, splat) \
+ static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(scalar_ty __x) { \
+ return splat(ty, __x); \
+ }
+
+#define __packed_shift(name, ty, op, mask) \
+ static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, \
+ unsigned __rs2) { \
+ return __rs1 op(__rs2 & (mask)); \
+ }
+#define __packed_shift8(name, ty, op) __packed_shift(name, ty, op, 0x7)
+#define __packed_shift16(name, ty, op) __packed_shift(name, ty, op, 0xf)
+#define __packed_shift32(name, ty, op) __packed_shift(name, ty, op, 0x1f)
+
+#define __packed_scalar_binary_op(name, ty, scalar_ty, op, splat) \
+ static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, \
+ scalar_ty __rs2) { \
+ return __rs1 op splat(ty, __rs2); \
+ }
+
+#define __packed_binary_op(name, ty, op) \
+ static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
+ return __rs1 op __rs2; \
+ }
+
+#define __packed_unary_op(name, ty, op) \
+ static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1) { \
+ return op __rs1; \
+ }
+
+#define __packed_binary_builtin(name, ty, builtin) \
+ static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
+ return builtin(__rs1, __rs2); \
+ }
+
+#define __packed_sh1add(name, ty) \
+ static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
+ return (__rs1 << 1) + __rs2; \
+ }
+
+/* TODO: switch to sadd_sat(__builtin_elementwise_shl_sat(a, 1), b) once a
+ * generic elementwise shl_sat builtin exists. Until then sadd_sat(a, a)
+ * stands in for shl_sat(a, 1): the two agree for signed element types, and
+ * the backend's saturating_shl1 PatFrags matches both shapes. */
+#define __packed_sh1sadd(name, ty) \
+ static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
+ return __builtin_elementwise_add_sat( \
+ __builtin_elementwise_add_sat(__rs1, __rs1), __rs2); \
+ }
+
+// clang-format off: macro call sites have no trailing semicolons, which
+// confuses clang-format into a deeply nested expression.
+
+/* Packed Splat (32-bit) */
+__packed_splat(pmv_s_u8x4, uint8x4_t, uint8_t, __packed_splat4)
+__packed_splat(pmv_s_i8x4, int8x4_t, int8_t, __packed_splat4)
+__packed_splat(pmv_s_u16x2, uint16x2_t, uint16_t, __packed_splat2)
+__packed_splat(pmv_s_i16x2, int16x2_t, int16_t, __packed_splat2)
+
+/* Packed Splat (64-bit) */
+__packed_splat(pmv_s_u8x8, uint8x8_t, uint8_t, __packed_splat8)
+__packed_splat(pmv_s_i8x8, int8x8_t, int8_t, __packed_splat8)
+__packed_splat(pmv_s_u16x4, uint16x4_t, uint16_t, __packed_splat4)
+__packed_splat(pmv_s_i16x4, int16x4_t, int16_t, __packed_splat4)
+__packed_splat(pmv_s_u32x2, uint32x2_t, uint32_t, __packed_splat2)
+__packed_splat(pmv_s_i32x2, int32x2_t, int32_t, __packed_splat2)
+
+/* Packed Addition and Subtraction (32-bit) */
+__packed_binary_op(padd_i8x4, int8x4_t, +)
+__packed_binary_op(padd_u8x4, uint8x4_t, +)
+__packed_binary_op(padd_i16x2, int16x2_t, +)
+__packed_binary_op(padd_u16x2, uint16x2_t, +)
+__packed_binary_op(psub_i8x4, int8x4_t, -)
+__packed_binary_op(psub_u8x4, uint8x4_t, -)
+__packed_binary_op(psub_i16x2, int16x2_t, -)
+__packed_binary_op(psub_u16x2, uint16x2_t, -)
+__packed_unary_op(pneg_i8x4, int8x4_t, -)
+__packed_unary_op(pneg_i16x2, int16x2_t, -)
+
+/* Packed Addition and Subtraction (64-bit) */
+__packed_binary_op(padd_i8x8, int8x8_t, +)
+__packed_binary_op(padd_u8x8, uint8x8_t, +)
+__packed_binary_op(padd_i16x4, int16x4_t, +)
+__packed_binary_op(padd_u16x4, uint16x4_t, +)
+__packed_binary_op(padd_i32x2, int32x2_t, +)
+__packed_binary_op(padd_u32x2, uint32x2_t, +)
+__packed_binary_op(psub_i8x8, int8x8_t, -)
+__packed_binary_op(psub_u8x8, uint8x8_t, -)
+__packed_binary_op(psub_i16x4, int16x4_t, -)
+__packed_binary_op(psub_u16x4, uint16x4_t, -)
+__packed_binary_op(psub_i32x2, int32x2_t, -)
+__packed_binary_op(psub_u32x2, uint32x2_t, -)
+__packed_unary_op(pneg_i8x8, int8x8_t, -)
+__packed_unary_op(pneg_i16x4, int16x4_t, -)
+__packed_unary_op(pneg_i32x2, int32x2_t, -)
+
+/* Packed Addition with Scalar (32-bit) */
+__packed_scalar_binary_op(padd_s_u8x4, uint8x4_t, uint8_t, +, __packed_splat4)
+__packed_scalar_binary_op(padd_s_i8x4, int8x4_t, int8_t, +, __packed_splat4)
+__packed_scalar_binary_op(padd_s_u16x2, uint16x2_t, uint16_t, +,
+ __packed_splat2)
+__packed_scalar_binary_op(padd_s_i16x2, int16x2_t, int16_t, +,
+ __packed_splat2)
+
+/* Packed Addition with Scalar (64-bit) */
+__packed_scalar_binary_op(padd_s_u8x8, uint8x8_t, uint8_t, +, __packed_splat8)
+__packed_scalar_binary_op(padd_s_i8x8, int8x8_t, int8_t, +, __packed_splat8)
+__packed_scalar_binary_op(padd_s_u16x4, uint16x4_t, uint16_t, +,
+ __packed_splat4)
+__packed_scalar_binary_op(padd_s_i16x4, int16x4_t, int16_t, +,
+ __packed_splat4)
+__packed_scalar_binary_op(padd_s_u32x2, uint32x2_t, uint32_t, +,
+ __packed_splat2)
+__packed_scalar_binary_op(padd_s_i32x2, int32x2_t, int32_t, +,
+ __packed_splat2)
+
+/* Packed Saturating Addition and Subtraction (32-bit) */
+__packed_binary_builtin(psadd_i8x4, int8x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psadd_i16x2, int16x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u8x4, uint8x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u16x2, uint16x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(pssub_i8x4, int8x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssub_i16x2, int16x2_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u8x4, uint8x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u16x2, uint16x2_t, __builtin_elementwise_sub_sat)
+
+/* Packed Saturating Addition and Subtraction (64-bit) */
+__packed_binary_builtin(psadd_i8x8, int8x8_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psadd_i16x4, int16x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psadd_i32x2, int32x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u8x8, uint8x8_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u16x4, uint16x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u32x2, uint32x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(pssub_i8x8, int8x8_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssub_i16x4, int16x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssub_i32x2, int32x2_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u8x8, uint8x8_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u16x4, uint16x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u32x2, uint32x2_t, __builtin_elementwise_sub_sat)
+
+/* Packed Shift-Add (32-bit) */
+__packed_sh1add(psh1add_i16x2, int16x2_t)
+__packed_sh1add(psh1add_u16x2, uint16x2_t)
+__packed_sh1sadd(pssh1sadd_i16x2, int16x2_t)
+
+/* Packed Shift-Add (64-bit) */
+__packed_sh1add(psh1add_i16x4, int16x4_t)
+__packed_sh1add(psh1add_u16x4, uint16x4_t)
+__packed_sh1add(psh1add_i32x2, int32x2_t)
+__packed_sh1add(psh1add_u32x2, uint32x2_t)
+__packed_sh1sadd(pssh1sadd_i16x4, int16x4_t)
+__packed_sh1sadd(pssh1sadd_i32x2, int32x2_t)
+
+/* Packed Minimum and Maximum (32-bit) */
+__packed_binary_builtin(pmin_i8x4, int8x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmin_i16x2, int16x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u8x4, uint8x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u16x2, uint16x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmax_i8x4, int8x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmax_i16x2, int16x2_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u8x4, uint8x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u16x2, uint16x2_t, __builtin_elementwise_max)
+
+/* Packed Minimum and Maximum (64-bit) */
+__packed_binary_builtin(pmin_i8x8, int8x8_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmin_i16x4, int16x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmin_i32x2, int32x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u8x8, uint8x8_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u16x4, uint16x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u32x2, uint32x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmax_i8x8, int8x8_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmax_i16x4, int16x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmax_i32x2, int32x2_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u8x8, uint8x8_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u16x4, uint16x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u32x2, uint32x2_t, __builtin_elementwise_max)
+
+/* Packed Shifts (32-bit) */
+__packed_shift8(psll_s_u8x4, uint8x4_t, <<)
+__packed_shift8(psll_s_i8x4, int8x4_t, <<)
+__packed_shift16(psll_s_u16x2, uint16x2_t, <<)
+__packed_shift16(psll_s_i16x2, int16x2_t, <<)
+__packed_shift8(psrl_s_u8x4, uint8x4_t, >>)
+__packed_shift16(psrl_s_u16x2, uint16x2_t, >>)
+__packed_shift8(psra_s_i8x4, int8x4_t, >>)
+__packed_shift16(psra_s_i16x2, int16x2_t, >>)
+
+/* Packed Shifts (64-bit) */
+__packed_shift8(psll_s_u8x8, uint8x8_t, <<)
+__packed_shift8(psll_s_i8x8, int8x8_t, <<)
+__packed_shift16(psll_s_u16x4, uint16x4_t, <<)
+__packed_shift16(psll_s_i16x4, int16x4_t, <<)
+__packed_shift32(psll_s_u32x2, uint32x2_t, <<)
+__packed_shift32(psll_s_i32x2, int32x2_t, <<)
+__packed_shift8(psrl_s_u8x8, uint8x8_t, >>)
+__packed_shift16(psrl_s_u16x4, uint16x4_t, >>)
+__packed_shift32(psrl_s_u32x2, uint32x2_t, >>)
+__packed_shift8(psra_s_i8x8, int8x8_t, >>)
+__packed_shift16(psra_s_i16x4, int16x4_t, >>)
+__packed_shift32(psra_s_i32x2, int32x2_t, >>)
+
+/* Packed Logical Operations (32-bit) */
+__packed_binary_op(pand_i8x4, int8x4_t, &)
+__packed_binary_op(pand_u8x4, uint8x4_t, &)
+__packed_binary_op(pand_i16x2, int16x2_t, &)
+__packed_binary_op(pand_u16x2, uint16x2_t, &)
+__packed_binary_op(por_i8x4, int8x4_t, |)
+__packed_binary_op(por_u8x4, uint8x4_t, |)
+__packed_binary_op(por_i16x2, int16x2_t, |)
+__packed_binary_op(por_u16x2, uint16x2_t, |)
+__packed_binary_op(pxor_i8x4, int8x4_t, ^)
+__packed_binary_op(pxor_u8x4, uint8x4_t, ^)
+__packed_binary_op(pxor_i16x2, int16x2_t, ^)
+__packed_binary_op(pxor_u16x2, uint16x2_t, ^)
+__packed_unary_op(pnot_i8x4, int8x4_t, ~)
+__packed_unary_op(pnot_u8x4, uint8x4_t, ~)
+__packed_unary_op(pnot_i16x2, int16x2_t, ~)
+__packed_unary_op(pnot_u16x2, uint16x2_t, ~)
+
+/* Packed Logical Operations (64-bit) */
+__packed_binary_op(pand_i8x8, int8x8_t, &)
+__packed_binary_op(pand_u8x8, uint8x8_t, &)
+__packed_binary_op(pand_i16x4, int16x4_t, &)
+__packed_binary_op(pand_u16x4, uint16x4_t, &)
+__packed_binary_op(pand_i32x2, int32x2_t, &)
+__packed_binary_op(pand_u32x2, uint32x2_t, &)
+__packed_binary_op(por_i8x8, int8x8_t, |)
+__packed_binary_op(por_u8x8, uint8x8_t, |)
+__packed_binary_op(por_i16x4, int16x4_t, |)
+__packed_binary_op(por_u16x4, uint16x4_t, |)
+__packed_binary_op(por_i32x2, int32x2_t, |)
+__packed_binary_op(por_u32x2, uint32x2_t, |)
+__packed_binary_op(pxor_i8x8, int8x8_t, ^)
+__packed_binary_op(pxor_u8x8, uint8x8_t, ^)
+__packed_binary_op(pxor_i16x4, int16x4_t, ^)
+__packed_binary_op(pxor_u16x4, uint16x4_t, ^)
+__packed_binary_op(pxor_i32x2, int32x2_t, ^)
+__packed_binary_op(pxor_u32x2, uint32x2_t, ^)
+__packed_unary_op(pnot_i8x8, int8x8_t, ~)
+__packed_unary_op(pnot_u8x8, uint8x8_t, ~)
+__packed_unary_op(pnot_i16x4, int16x4_t, ~)
+__packed_unary_op(pnot_u16x4, uint16x4_t, ~)
+__packed_unary_op(pnot_i32x2, int32x2_t, ~)
+__packed_unary_op(pnot_u32x2, uint32x2_t, ~)
+
+// clang-format on
+
+#undef __packed_splat2
+#undef __packed_splat4
+#undef __packed_splat8
+#undef __packed_splat
+#undef __packed_shift
+#undef __packed_shift8
+#undef __packed_shift16
+#undef __packed_shift32
+#undef __packed_scalar_binary_op
+#undef __packed_binary_op
+#undef __packed_unary_op
+#undef __packed_binary_builtin
+#undef __packed_sh1add
+#undef __packed_sh1sadd
+#undef __DEFAULT_FN_ATTRS
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __RISCV_PACKED_SIMD_H */
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
new file mode 100644
index 0000000000000..73db0bee19def
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -0,0 +1,3349 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple riscv32 -target-feature +experimental-p \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=sroa,instcombine | FileCheck %s --check-prefix=RV32
+// RUN: %clang_cc1 -triple riscv64 -target-feature +experimental-p \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=sroa,instcombine | FileCheck %s --check-prefix=RV64
+
+#include <riscv_packed_simd.h>
+
+/* Packed Splat (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_u8x4(
+// RV32-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV32-NEXT: ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_u8x4(
+// RV64-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV64-NEXT: ret i32 [[TMP0]]
+//
+uint8x4_t test_pmv_s_u8x4(uint8_t x) {
+ return __riscv_pmv_s_u8x4(x);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_i8x4(
+// RV32-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV32-NEXT: ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_i8x4(
+// RV64-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV64-NEXT: ret i32 [[TMP0]]
+//
+int8x4_t test_pmv_s_i8x4(int8_t x) {
+ return __riscv_pmv_s_i8x4(x);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_u16x2(
+// RV32-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV32-NEXT: ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_u16x2(
+// RV64-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV64-NEXT: ret i32 [[TMP0]]
+//
+uint16x2_t test_pmv_s_u16x2(uint16_t x) {
+ return __riscv_pmv_s_u16x2(x);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_i16x2(
+// RV32-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV32-NEXT: ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_i16x2(
+// RV64-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV64-NEXT: ret i32 [[TMP0]]
+//
+int16x2_t test_pmv_s_i16x2(int16_t x) {
+ return __riscv_pmv_s_i16x2(x);
+}
+
+/* Packed Splat (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_u8x8(
+// RV32-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV32-NEXT: ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_u8x8(
+// RV64-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV64-NEXT: ret i64 [[TMP0]]
+//
+uint8x8_t test_pmv_s_u8x8(uint8_t x) {
+ return __riscv_pmv_s_u8x8(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_i8x8(
+// RV32-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV32-NEXT: ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_i8x8(
+// RV64-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV64-NEXT: ret i64 [[TMP0]]
+//
+int8x8_t test_pmv_s_i8x8(int8_t x) {
+ return __riscv_pmv_s_i8x8(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_u16x4(
+// RV32-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV32-NEXT: ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_u16x4(
+// RV64-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV64-NEXT: ret i64 [[TMP0]]
+//
+uint16x4_t test_pmv_s_u16x4(uint16_t x) {
+ return __riscv_pmv_s_u16x4(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_i16x4(
+// RV32-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV32-NEXT: ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_i16x4(
+// RV64-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV64-NEXT: ret i64 [[TMP0]]
+//
+int16x4_t test_pmv_s_i16x4(int16_t x) {
+ return __riscv_pmv_s_i16x4(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_u32x2(
+// RV32-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV32-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV32-NEXT: ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_u32x2(
+// RV64-SAME: i32 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV64-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV64-NEXT: ret i64 [[TMP0]]
+//
+uint32x2_t test_pmv_s_u32x2(uint32_t x) {
+ return __riscv_pmv_s_u32x2(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_i32x2(
+// RV32-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV32-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV32-NEXT: ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_i32x2(
+// RV64-SAME: i32 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV64-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV64-NEXT: ret i64 [[TMP0]]
+//
+int32x2_t test_pmv_s_i32x2(int32_t x) {
+ return __riscv_pmv_s_i32x2(x);
+}
+
+/* Packed Addition and Subtraction (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
+ return __riscv_padd_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
+ return __riscv_padd_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_padd_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
+ return __riscv_padd_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
+ return __riscv_psub_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) { // Codegen test for the u8x4 packed subtract intrinsic.
+ return __riscv_psub_u8x4(a, b); // both targets are checked for sub <4 x i8> (a - b), coerced through i32
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) { // Codegen test for the i16x2 packed subtract intrinsic.
+ return __riscv_psub_i16x2(a, b); // both targets are checked for sub <2 x i16> (a - b), coerced through i32
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) { // Codegen test for the u16x2 packed subtract intrinsic.
+ return __riscv_psub_u16x2(a, b); // both targets are checked for sub <2 x i16> (a - b), coerced through i32
+}
+
+// RV32-LABEL: define dso_local i32 @test_pneg_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <4 x i8> zeroinitializer, [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT: ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pneg_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <4 x i8> zeroinitializer, [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT: ret i32 [[TMP1]]
+//
+int8x4_t test_pneg_i8x4(int8x4_t a) { // Codegen test for the i8x4 packed negate intrinsic.
+ return __riscv_pneg_i8x4(a); // both targets are checked for sub <4 x i8> zeroinitializer, a
+}
+
+// RV32-LABEL: define dso_local i32 @test_pneg_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <2 x i16> zeroinitializer, [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT: ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pneg_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <2 x i16> zeroinitializer, [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT: ret i32 [[TMP1]]
+//
+int16x2_t test_pneg_i16x2(int16x2_t a) { // Codegen test for the i16x2 packed negate intrinsic.
+ return __riscv_pneg_i16x2(a); // both targets are checked for sub <2 x i16> zeroinitializer, a
+}
+
+/* Packed Addition and Subtraction (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) { // Codegen test for the i8x8 packed add intrinsic.
+ return __riscv_padd_i8x8(a, b); // both targets are checked for add <8 x i8>, coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) { // Codegen test for the u8x8 packed add intrinsic.
+ return __riscv_padd_u8x8(a, b); // both targets are checked for add <8 x i8>, coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) { // Codegen test for the i16x4 packed add intrinsic.
+ return __riscv_padd_i16x4(a, b); // both targets are checked for add <4 x i16>, coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) { // Codegen test for the u16x4 packed add intrinsic.
+ return __riscv_padd_u16x4(a, b); // both targets are checked for add <4 x i16>, coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) { // Codegen test for the i32x2 packed add intrinsic.
+ return __riscv_padd_i32x2(a, b); // both targets are checked for add <2 x i32>, coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) { // Codegen test for the u32x2 packed add intrinsic.
+ return __riscv_padd_u32x2(a, b); // both targets are checked for add <2 x i32>, coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) { // Codegen test for the i8x8 packed subtract intrinsic.
+ return __riscv_psub_i8x8(a, b); // both targets are checked for sub <8 x i8> (a - b), coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) { // Codegen test for the u8x8 packed subtract intrinsic.
+ return __riscv_psub_u8x8(a, b); // both targets are checked for sub <8 x i8> (a - b), coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) { // Codegen test for the i16x4 packed subtract intrinsic.
+ return __riscv_psub_i16x4(a, b); // both targets are checked for sub <4 x i16> (a - b), coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) { // Codegen test for the u16x4 packed subtract intrinsic.
+ return __riscv_psub_u16x4(a, b); // both targets are checked for sub <4 x i16> (a - b), coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) { // Codegen test for the i32x2 packed subtract intrinsic.
+ return __riscv_psub_i32x2(a, b); // both targets are checked for sub <2 x i32> (a - b), coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) { // Codegen test for the u32x2 packed subtract intrinsic.
+ return __riscv_psub_u32x2(a, b); // both targets are checked for sub <2 x i32> (a - b), coerced through i64
+}
+
+// RV32-LABEL: define dso_local i64 @test_pneg_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pneg_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+int8x8_t test_pneg_i8x8(int8x8_t a) { // Codegen test for the i8x8 packed negate intrinsic.
+ return __riscv_pneg_i8x8(a); // both targets are checked for sub <8 x i8> zeroinitializer, a
+}
+
+// RV32-LABEL: define dso_local i64 @test_pneg_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pneg_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+int16x4_t test_pneg_i16x4(int16x4_t a) { // Codegen test for the i16x4 packed negate intrinsic.
+ return __riscv_pneg_i16x4(a); // both targets are checked for sub <4 x i16> zeroinitializer, a
+}
+
+// RV32-LABEL: define dso_local i64 @test_pneg_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pneg_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+int32x2_t test_pneg_i32x2(int32x2_t a) { // Codegen test for the i32x2 packed negate intrinsic.
+ return __riscv_pneg_i32x2(a); // both targets are checked for sub <2 x i32> zeroinitializer, a
+}
+
+/* Packed Addition with Scalar (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT: [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT: ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT: [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT: ret i32 [[TMP1]]
+//
+uint8x4_t test_padd_s_u8x4(uint8x4_t a, uint8_t b) { // Codegen test for u8x4 add-with-scalar; b is checked as i8 zeroext.
+ return __riscv_padd_s_u8x4(a, b); // checks expect b splatted (insertelement + shufflevector), then add <4 x i8>
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT: [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT: ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT: [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT: ret i32 [[TMP1]]
+//
+int8x4_t test_padd_s_i8x4(int8x4_t a, int8_t b) { // Codegen test for i8x4 add-with-scalar; b is checked as i8 signext.
+ return __riscv_padd_s_i8x4(a, b); // checks expect b splatted (insertelement + shufflevector), then add <4 x i8>
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT: [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT: ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT: [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT: ret i32 [[TMP1]]
+//
+uint16x2_t test_padd_s_u16x2(uint16x2_t a, uint16_t b) { // Codegen test for u16x2 add-with-scalar; b is checked as i16 zeroext.
+ return __riscv_padd_s_u16x2(a, b); // checks expect b splatted (insertelement + shufflevector), then add <2 x i16>
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT: [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT: ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT: [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT: ret i32 [[TMP1]]
+//
+int16x2_t test_padd_s_i16x2(int16x2_t a, int16_t b) { // Codegen test for i16x2 add-with-scalar; b is checked as i16 signext.
+ return __riscv_padd_s_i16x2(a, b); // checks expect b splatted (insertelement + shufflevector), then add <2 x i16>
+}
+
+/* Packed Addition with Scalar (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT: [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT: [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+uint8x8_t test_padd_s_u8x8(uint8x8_t a, uint8_t b) { // Codegen test for u8x8 add-with-scalar; b is checked as i8 zeroext.
+ return __riscv_padd_s_u8x8(a, b); // checks expect b splatted (insertelement + shufflevector), then add <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT: [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT: [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+int8x8_t test_padd_s_i8x8(int8x8_t a, int8_t b) { // Codegen test for i8x8 add-with-scalar; b is checked as i8 signext.
+ return __riscv_padd_s_i8x8(a, b); // checks expect b splatted (insertelement + shufflevector), then add <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT: [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT: [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+uint16x4_t test_padd_s_u16x4(uint16x4_t a, uint16_t b) { // Codegen test for u16x4 add-with-scalar; b is checked as i16 zeroext.
+ return __riscv_padd_s_u16x4(a, b); // checks expect b splatted (insertelement + shufflevector), then add <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT: [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT: [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+int16x4_t test_padd_s_i16x4(int16x4_t a, int16_t b) { // Codegen test for i16x4 add-with-scalar; b is checked as i16 signext.
+ return __riscv_padd_s_i16x4(a, b); // checks expect b splatted (insertelement + shufflevector), then add <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV32-NEXT: [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV64-NEXT: [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+uint32x2_t test_padd_s_u32x2(uint32x2_t a, uint32_t b) { // Codegen test for u32x2 add-with-scalar; b is signext in the RV64 checks only, despite being unsigned.
+ return __riscv_padd_s_u32x2(a, b); // checks expect b splatted (insertelement + shufflevector), then add <2 x i32>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV32-NEXT: [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV64-NEXT: [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+int32x2_t test_padd_s_i32x2(int32x2_t a, int32_t b) { // signed-typed counterpart of the scalar-add test
+ return __riscv_padd_s_i32x2(a, b); // assertions above expect the same splat of b followed by a <2 x i32> add
+}
+
+/* Packed Saturating Addition and Subtraction (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_psadd_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psadd_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int8x4_t test_psadd_i8x4(int8x4_t a, int8x4_t b) { // signed saturating add on four 8-bit lanes
+ return __riscv_psadd_i8x4(a, b); // assertions above expect a single call to llvm.sadd.sat.v4i8
+}
+
+// RV32-LABEL: define dso_local i32 @test_psadd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psadd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int16x2_t test_psadd_i16x2(int16x2_t a, int16x2_t b) { // signed saturating add on two 16-bit lanes
+ return __riscv_psadd_i16x2(a, b); // assertions above expect a single call to llvm.sadd.sat.v2i16
+}
+
+// RV32-LABEL: define dso_local i32 @test_psaddu_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psaddu_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint8x4_t test_psaddu_u8x4(uint8x4_t a, uint8x4_t b) { // unsigned saturating add on four 8-bit lanes
+ return __riscv_psaddu_u8x4(a, b); // assertions above expect a single call to llvm.uadd.sat.v4i8
+}
+
+// RV32-LABEL: define dso_local i32 @test_psaddu_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psaddu_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint16x2_t test_psaddu_u16x2(uint16x2_t a, uint16x2_t b) { // unsigned saturating add on two 16-bit lanes
+ return __riscv_psaddu_u16x2(a, b); // assertions above expect a single call to llvm.uadd.sat.v2i16
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssub_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssub_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int8x4_t test_pssub_i8x4(int8x4_t a, int8x4_t b) { // signed saturating subtract (a - b) on four 8-bit lanes
+ return __riscv_pssub_i8x4(a, b); // assertions above expect a single call to llvm.ssub.sat.v4i8
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssub_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssub_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int16x2_t test_pssub_i16x2(int16x2_t a, int16x2_t b) { // signed saturating subtract (a - b) on two 16-bit lanes
+ return __riscv_pssub_i16x2(a, b); // assertions above expect a single call to llvm.ssub.sat.v2i16
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssubu_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssubu_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint8x4_t test_pssubu_u8x4(uint8x4_t a, uint8x4_t b) { // unsigned saturating subtract (a - b) on four 8-bit lanes
+ return __riscv_pssubu_u8x4(a, b); // assertions above expect a single call to llvm.usub.sat.v4i8
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssubu_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssubu_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint16x2_t test_pssubu_u16x2(uint16x2_t a, uint16x2_t b) { // unsigned saturating subtract (a - b) on two 16-bit lanes
+ return __riscv_pssubu_u16x2(a, b); // assertions above expect a single call to llvm.usub.sat.v2i16
+}
+
+/* Packed Saturating Addition and Subtraction (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_psadd_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psadd_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int8x8_t test_psadd_i8x8(int8x8_t a, int8x8_t b) { // signed saturating add on eight 8-bit lanes (64-bit vector)
+ return __riscv_psadd_i8x8(a, b); // assertions above expect a single call to llvm.sadd.sat.v8i8
+}
+
+// RV32-LABEL: define dso_local i64 @test_psadd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psadd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int16x4_t test_psadd_i16x4(int16x4_t a, int16x4_t b) { // signed saturating add on four 16-bit lanes (64-bit vector)
+ return __riscv_psadd_i16x4(a, b); // assertions above expect a single call to llvm.sadd.sat.v4i16
+}
+
+// RV32-LABEL: define dso_local i64 @test_psadd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psadd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int32x2_t test_psadd_i32x2(int32x2_t a, int32x2_t b) { // signed saturating add on two 32-bit lanes (64-bit vector)
+ return __riscv_psadd_i32x2(a, b); // assertions above expect a single call to llvm.sadd.sat.v2i32
+}
+
+// RV32-LABEL: define dso_local i64 @test_psaddu_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psaddu_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint8x8_t test_psaddu_u8x8(uint8x8_t a, uint8x8_t b) { // unsigned saturating add on eight 8-bit lanes (64-bit vector)
+ return __riscv_psaddu_u8x8(a, b); // assertions above expect a single call to llvm.uadd.sat.v8i8
+}
+
+// RV32-LABEL: define dso_local i64 @test_psaddu_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psaddu_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint16x4_t test_psaddu_u16x4(uint16x4_t a, uint16x4_t b) { // unsigned saturating add on four 16-bit lanes (64-bit vector)
+ return __riscv_psaddu_u16x4(a, b); // assertions above expect a single call to llvm.uadd.sat.v4i16
+}
+
+// RV32-LABEL: define dso_local i64 @test_psaddu_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psaddu_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint32x2_t test_psaddu_u32x2(uint32x2_t a, uint32x2_t b) { // unsigned saturating add on two 32-bit lanes (64-bit vector)
+ return __riscv_psaddu_u32x2(a, b); // assertions above expect a single call to llvm.uadd.sat.v2i32
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssub_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssub_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int8x8_t test_pssub_i8x8(int8x8_t a, int8x8_t b) { // signed saturating subtract (a - b) on eight 8-bit lanes
+ return __riscv_pssub_i8x8(a, b); // assertions above expect a single call to llvm.ssub.sat.v8i8
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssub_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssub_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int16x4_t test_pssub_i16x4(int16x4_t a, int16x4_t b) { // signed saturating subtract (a - b) on four 16-bit lanes
+ return __riscv_pssub_i16x4(a, b); // assertions above expect a single call to llvm.ssub.sat.v4i16
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssub_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssub_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int32x2_t test_pssub_i32x2(int32x2_t a, int32x2_t b) { // signed saturating subtract (a - b) on two 32-bit lanes
+ return __riscv_pssub_i32x2(a, b); // assertions above expect a single call to llvm.ssub.sat.v2i32
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssubu_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssubu_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint8x8_t test_pssubu_u8x8(uint8x8_t a, uint8x8_t b) { // unsigned saturating subtract (a - b) on eight 8-bit lanes
+ return __riscv_pssubu_u8x8(a, b); // assertions above expect a single call to llvm.usub.sat.v8i8
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssubu_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssubu_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint16x4_t test_pssubu_u16x4(uint16x4_t a, uint16x4_t b) { // unsigned saturating subtract (a - b) on four 16-bit lanes
+ return __riscv_pssubu_u16x4(a, b); // assertions above expect a single call to llvm.usub.sat.v4i16
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssubu_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssubu_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint32x2_t test_pssubu_u32x2(uint32x2_t a, uint32x2_t b) { // unsigned saturating subtract (a - b) on two 32-bit lanes
+ return __riscv_pssubu_u32x2(a, b); // assertions above expect a single call to llvm.usub.sat.v2i32
+}
+
+/* Packed Shift-Add (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_psh1add_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psh1add_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int16x2_t test_psh1add_i16x2(int16x2_t a, int16x2_t b) { // shift-add: (a << 1) + b on two 16-bit lanes
+ return __riscv_psh1add_i16x2(a, b); // assertions above expect a plain shl-by-1 then add, no saturation intrinsic
+}
+
+// RV32-LABEL: define dso_local i32 @test_psh1add_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psh1add_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint16x2_t test_psh1add_u16x2(uint16x2_t a, uint16x2_t b) { // unsigned-typed shift-add: (a << 1) + b on two 16-bit lanes
+ return __riscv_psh1add_u16x2(a, b); // assertions above expect the same shl-by-1 then add as the signed-typed test
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssh1sadd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP0]])
+// RV32-NEXT: [[ELT_SAT3_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ELT_SAT_I]], <2 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT3_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssh1sadd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP0]])
+// RV64-NEXT: [[ELT_SAT3_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ELT_SAT_I]], <2 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT3_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int16x2_t test_pssh1sadd_i16x2(int16x2_t a, int16x2_t b) { // saturating shift-add on two 16-bit lanes
+ return __riscv_pssh1sadd_i16x2(a, b); // assertions above expect llvm.sadd.sat(a, a), then llvm.sadd.sat of that result with b
+}
+
+/* Packed Shift-Add (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int16x4_t test_psh1add_i16x4(int16x4_t a, int16x4_t b) { // shift-add: (a << 1) + b on four 16-bit lanes
+ return __riscv_psh1add_i16x4(a, b); // assertions above expect a plain <4 x i16> shl-by-1 then add
+}
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint16x4_t test_psh1add_u16x4(uint16x4_t a, uint16x4_t b) { // unsigned-typed shift-add: (a << 1) + b on four 16-bit lanes
+ return __riscv_psh1add_u16x4(a, b); // assertions above expect a plain <4 x i16> shl-by-1 then add
+}
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int32x2_t test_psh1add_i32x2(int32x2_t a, int32x2_t b) { // shift-add: (a << 1) + b on two 32-bit lanes
+ return __riscv_psh1add_i32x2(a, b); // assertions above expect a plain <2 x i32> shl-by-1 then add
+}
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV32-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV64-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint32x2_t test_psh1add_u32x2(uint32x2_t a, uint32x2_t b) { // unsigned-typed shift-add: (a << 1) + b on two 32-bit lanes
+ return __riscv_psh1add_u32x2(a, b); // assertions above expect a plain <2 x i32> shl-by-1 then add
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssh1sadd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP0]])
+// RV32-NEXT: [[ELT_SAT3_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[ELT_SAT_I]], <4 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT3_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssh1sadd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP0]])
+// RV64-NEXT: [[ELT_SAT3_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[ELT_SAT_I]], <4 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT3_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int16x4_t test_pssh1sadd_i16x4(int16x4_t a, int16x4_t b) { // CHECKs expect llvm.sadd.sat.v4i16(a, a), then llvm.sadd.sat.v4i16(that, b)
+ return __riscv_pssh1sadd_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssh1sadd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP0]])
+// RV32-NEXT: [[ELT_SAT3_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ELT_SAT_I]], <2 x i32> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT3_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssh1sadd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP0]])
+// RV64-NEXT: [[ELT_SAT3_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ELT_SAT_I]], <2 x i32> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT3_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int32x2_t test_pssh1sadd_i32x2(int32x2_t a, int32x2_t b) { // CHECKs expect llvm.sadd.sat.v2i32(a, a), then llvm.sadd.sat.v2i32(that, b)
+ return __riscv_pssh1sadd_i32x2(a, b);
+}
+
+/* Packed Minimum and Maximum (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_pmin_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmin_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int8x4_t test_pmin_i8x4(int8x4_t a, int8x4_t b) { // CHECKs expect a single llvm.smin.v4i8 call; vectors are passed/returned as i32
+ return __riscv_pmin_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmin_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmin_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int16x2_t test_pmin_i16x2(int16x2_t a, int16x2_t b) { // CHECKs expect a single llvm.smin.v2i16 call; vectors are passed/returned as i32
+ return __riscv_pmin_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pminu_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pminu_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint8x4_t test_pminu_u8x4(uint8x4_t a, uint8x4_t b) { // CHECKs expect a single llvm.umin.v4i8 call; vectors are passed/returned as i32
+ return __riscv_pminu_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pminu_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pminu_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint16x2_t test_pminu_u16x2(uint16x2_t a, uint16x2_t b) { // CHECKs expect a single llvm.umin.v2i16 call; vectors are passed/returned as i32
+ return __riscv_pminu_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmax_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmax_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int8x4_t test_pmax_i8x4(int8x4_t a, int8x4_t b) { // CHECKs expect a single llvm.smax.v4i8 call; vectors are passed/returned as i32
+ return __riscv_pmax_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmax_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmax_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+int16x2_t test_pmax_i16x2(int16x2_t a, int16x2_t b) { // CHECKs expect a single llvm.smax.v2i16 call; vectors are passed/returned as i32
+ return __riscv_pmax_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmaxu_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT: [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmaxu_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT: [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint8x4_t test_pmaxu_u8x4(uint8x4_t a, uint8x4_t b) { // CHECKs expect a single llvm.umax.v4i8 call; vectors are passed/returned as i32
+ return __riscv_pmaxu_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmaxu_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT: [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
+// RV32-NEXT: ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmaxu_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT: [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
+// RV64-NEXT: ret i32 [[TMP2]]
+//
+uint16x2_t test_pmaxu_u16x2(uint16x2_t a, uint16x2_t b) { // CHECKs expect a single llvm.umax.v2i16 call; vectors are passed/returned as i32
+ return __riscv_pmaxu_u16x2(a, b);
+}
+
+/* Packed Minimum and Maximum (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_pmin_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmin_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int8x8_t test_pmin_i8x8(int8x8_t a, int8x8_t b) { // CHECKs expect a single llvm.smin.v8i8 call; vectors are passed/returned as i64
+ return __riscv_pmin_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmin_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmin_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int16x4_t test_pmin_i16x4(int16x4_t a, int16x4_t b) { // CHECKs expect a single llvm.smin.v4i16 call; vectors are passed/returned as i64
+ return __riscv_pmin_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmin_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmin_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int32x2_t test_pmin_i32x2(int32x2_t a, int32x2_t b) { // CHECKs expect a single llvm.smin.v2i32 call; vectors are passed/returned as i64
+ return __riscv_pmin_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pminu_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pminu_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint8x8_t test_pminu_u8x8(uint8x8_t a, uint8x8_t b) { // CHECKs expect a single llvm.umin.v8i8 call; vectors are passed/returned as i64
+ return __riscv_pminu_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pminu_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pminu_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint16x4_t test_pminu_u16x4(uint16x4_t a, uint16x4_t b) { // CHECKs expect a single llvm.umin.v4i16 call; vectors are passed/returned as i64
+ return __riscv_pminu_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pminu_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pminu_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint32x2_t test_pminu_u32x2(uint32x2_t a, uint32x2_t b) { // CHECKs expect a single llvm.umin.v2i32 call; vectors are passed/returned as i64
+ return __riscv_pminu_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmax_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmax_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int8x8_t test_pmax_i8x8(int8x8_t a, int8x8_t b) { // CHECKs expect a single llvm.smax.v8i8 call; vectors are passed/returned as i64
+ return __riscv_pmax_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmax_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.smax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmax_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.smax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int16x4_t test_pmax_i16x4(int16x4_t a, int16x4_t b) { // CHECKs expect a single llvm.smax.v4i16 call; vectors are passed/returned as i64
+ return __riscv_pmax_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmax_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmax_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+int32x2_t test_pmax_i32x2(int32x2_t a, int32x2_t b) { // CHECKs expect a single llvm.smax.v2i32 call; vectors are passed/returned as i64
+ return __riscv_pmax_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT: [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT: [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint8x8_t test_pmaxu_u8x8(uint8x8_t a, uint8x8_t b) { // CHECKs expect a single llvm.umax.v8i8 call; vectors are passed/returned as i64
+ return __riscv_pmaxu_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT: [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.umax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT: [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.umax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint16x4_t test_pmaxu_u16x4(uint16x4_t a, uint16x4_t b) { // CHECKs expect a single llvm.umax.v4i16 call; vectors are passed/returned as i64
+ return __riscv_pmaxu_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT: [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.umax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
+// RV32-NEXT: ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT: [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.umax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
+// RV64-NEXT: ret i64 [[TMP2]]
+//
+uint32x2_t test_pmaxu_u32x2(uint32x2_t a, uint32x2_t b) { // CHECKs expect a single llvm.umax.v2i32 call; vectors are passed/returned as i64
+ return __riscv_pmaxu_u32x2(a, b);
+}
+
+/* Packed Shifts (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT: ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT: ret i32 [[TMP4]]
+//
+int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) { // CHECKs expect shamt trunc to i8, masked with 7, splatted, then shl <4 x i8>
+ return __riscv_psll_s_i8x4(a, shamt); // on RV64 the CHECK-SAME line additionally expects shamt to carry signext
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT: ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT: ret i32 [[TMP4]]
+//
+uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) { // CHECKs expect shamt trunc to i8, masked with 7, splatted, then shl <4 x i8>
+ return __riscv_psll_s_u8x4(a, shamt); // on RV64 the CHECK-SAME line additionally expects shamt to carry signext
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT: ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT: ret i32 [[TMP4]]
+//
+int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) { // CHECKs expect shamt trunc to i16, masked with 15, splatted, then shl <2 x i16>
+ return __riscv_psll_s_i16x2(a, shamt); // on RV64 the CHECK-SAME line additionally expects shamt to carry signext
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT: ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT: ret i32 [[TMP4]]
+//
+uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) { // CHECKs expect shamt trunc to i16, masked with 15, splatted, then shl <2 x i16>
+ return __riscv_psll_s_u16x2(a, shamt); // on RV64 the CHECK-SAME line additionally expects shamt to carry signext
+}
+
+// RV32-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT: ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT: ret i32 [[TMP4]]
+//
+int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) { // codegen test: arithmetic right shift of an int8x4_t by one scalar amount
+ return __riscv_psra_s_i8x4(a, shamt); // CHECK lines above expect shamt truncated to i8, masked with 7, splatted, then ashr <4 x i8>
+}
+
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT: ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT: ret i32 [[TMP4]]
+//
+uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) { // codegen test: logical right shift of a uint8x4_t by one scalar amount
+ return __riscv_psrl_s_u8x4(a, shamt); // CHECK lines above expect shamt truncated to i8, masked with 7, splatted, then lshr <4 x i8>
+}
+
+// RV32-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT: ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT: ret i32 [[TMP4]]
+//
+int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) { // codegen test: arithmetic right shift of an int16x2_t by one scalar amount
+ return __riscv_psra_s_i16x2(a, shamt); // CHECK lines above expect shamt truncated to i16, masked with 15, splatted, then ashr <2 x i16>
+}
+
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT: ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT: ret i32 [[TMP4]]
+//
+uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) { // codegen test: logical right shift of a uint16x2_t by one scalar amount
+ return __riscv_psrl_s_u16x2(a, shamt); // CHECK lines above expect shamt truncated to i16, masked with 15, splatted, then lshr <2 x i16>
+}
+
+/* Packed Shifts (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT: [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT: ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT: [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT: ret i64 [[TMP4]]
+//
+int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) { // codegen test: left shift of an int8x8_t by one scalar amount
+ return __riscv_psll_s_i8x8(a, shamt); // CHECK lines above expect shamt truncated to i8, masked with 7, splatted, then shl <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT: [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT: ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT: [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT: ret i64 [[TMP4]]
+//
+uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) { // codegen test: left shift of a uint8x8_t by one scalar amount
+ return __riscv_psll_s_u8x8(a, shamt); // CHECK lines above expect shamt truncated to i8, masked with 7, splatted, then shl <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT: ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT: ret i64 [[TMP4]]
+//
+int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) { // codegen test: left shift of an int16x4_t by one scalar amount
+ return __riscv_psll_s_i16x4(a, shamt); // CHECK lines above expect shamt truncated to i16, masked with 15, splatted, then shl <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT: ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT: ret i64 [[TMP4]]
+//
+uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) { // codegen test: left shift of a uint16x4_t by one scalar amount
+ return __riscv_psll_s_u16x4(a, shamt); // CHECK lines above expect shamt truncated to i16, masked with 15, splatted, then shl <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) { // codegen test: left shift of an int32x2_t by one scalar amount
+ return __riscv_psll_s_i32x2(a, shamt); // CHECK lines above expect shamt masked with 31 (no truncation), splatted, then shl <2 x i32>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) { // codegen test: left shift of a uint32x2_t by one scalar amount
+ return __riscv_psll_s_u32x2(a, shamt); // CHECK lines above expect shamt masked with 31 (no truncation), splatted, then shl <2 x i32>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT: [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT: ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT: [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT: ret i64 [[TMP4]]
+//
+int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) { // codegen test: arithmetic right shift of an int8x8_t by one scalar amount
+ return __riscv_psra_s_i8x8(a, shamt); // CHECK lines above expect shamt truncated to i8, masked with 7, splatted, then ashr <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT: [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT: ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT: [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT: ret i64 [[TMP4]]
+//
+uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) { // codegen test: logical right shift of a uint8x8_t by one scalar amount
+ return __riscv_psrl_s_u8x8(a, shamt); // CHECK lines above expect shamt truncated to i8, masked with 7, splatted, then lshr <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT: ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT: ret i64 [[TMP4]]
+//
+int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) { // codegen test: arithmetic right shift of an int16x4_t by one scalar amount
+ return __riscv_psra_s_i16x4(a, shamt); // CHECK lines above expect shamt truncated to i16, masked with 15, splatted, then ashr <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT: [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT: ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT: [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT: [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT: ret i64 [[TMP4]]
+//
+uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) { // codegen test: logical right shift of a uint16x4_t by one scalar amount
+ return __riscv_psrl_s_u16x4(a, shamt); // CHECK lines above expect shamt truncated to i16, masked with 15, splatted, then lshr <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) { // codegen test: arithmetic right shift of an int32x2_t by one scalar amount
+ return __riscv_psra_s_i32x2(a, shamt); // CHECK lines above expect shamt masked with 31 (no truncation), splatted, then ashr <2 x i32>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT: [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT: [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT: [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned shamt) { // codegen test: logical right shift of a uint32x2_t by one scalar amount
+ return __riscv_psrl_s_u32x2(a, shamt); // CHECK lines above expect shamt masked with 31 (no truncation), splatted, then lshr <2 x i32>
+}
+
+/* Packed Logical Operations (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_pand_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[AND_I1]]
+//
+int8x4_t test_pand_i8x4(int8x4_t a, int8x4_t b) { // codegen test: bitwise AND of two int8x4_t values
+ return __riscv_pand_i8x4(a, b); // CHECK lines above expect one 'and i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_pand_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[AND_I1]]
+//
+uint8x4_t test_pand_u8x4(uint8x4_t a, uint8x4_t b) { // codegen test: bitwise AND of two uint8x4_t values
+ return __riscv_pand_u8x4(a, b); // CHECK lines above expect one 'and i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_pand_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[AND_I1]]
+//
+int16x2_t test_pand_i16x2(int16x2_t a, int16x2_t b) { // codegen test: bitwise AND of two int16x2_t values
+ return __riscv_pand_i16x2(a, b); // CHECK lines above expect one 'and i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_pand_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[AND_I1]]
+//
+uint16x2_t test_pand_u16x2(uint16x2_t a, uint16x2_t b) { // codegen test: bitwise AND of two uint16x2_t values
+ return __riscv_pand_u16x2(a, b); // CHECK lines above expect one 'and i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[OR_I1]]
+//
+int8x4_t test_por_i8x4(int8x4_t a, int8x4_t b) { // codegen test: bitwise OR of two int8x4_t values
+ return __riscv_por_i8x4(a, b); // CHECK lines above expect one 'or i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[OR_I1]]
+//
+uint8x4_t test_por_u8x4(uint8x4_t a, uint8x4_t b) { // codegen test: bitwise OR of two uint8x4_t values
+ return __riscv_por_u8x4(a, b); // CHECK lines above expect one 'or i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[OR_I1]]
+//
+int16x2_t test_por_i16x2(int16x2_t a, int16x2_t b) { // codegen test: bitwise OR of two int16x2_t values
+ return __riscv_por_i16x2(a, b); // CHECK lines above expect one 'or i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[OR_I1]]
+//
+uint16x2_t test_por_u16x2(uint16x2_t a, uint16x2_t b) { // codegen test: bitwise OR of two uint16x2_t values
+ return __riscv_por_u16x2(a, b); // CHECK lines above expect one 'or i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[XOR_I1]]
+//
+int8x4_t test_pxor_i8x4(int8x4_t a, int8x4_t b) { // codegen test: bitwise XOR of two int8x4_t values
+ return __riscv_pxor_i8x4(a, b); // CHECK lines above expect one 'xor i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[XOR_I1]]
+//
+uint8x4_t test_pxor_u8x4(uint8x4_t a, uint8x4_t b) { // codegen test: bitwise XOR of two uint8x4_t values
+ return __riscv_pxor_u8x4(a, b); // CHECK lines above expect one 'xor i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[XOR_I1]]
+//
+int16x2_t test_pxor_i16x2(int16x2_t a, int16x2_t b) { // codegen test: bitwise XOR of two int16x2_t values
+ return __riscv_pxor_i16x2(a, b); // CHECK lines above expect one 'xor i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i32 [[XOR_I1]]
+//
+uint16x2_t test_pxor_u16x2(uint16x2_t a, uint16x2_t b) { // codegen test: bitwise XOR of two uint16x2_t values
+ return __riscv_pxor_u16x2(a, b); // CHECK lines above expect one 'xor i32' directly on the two coerced i32 arguments
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP1]]
+//
+int8x4_t test_pnot_i8x4(int8x4_t a) {
+ return __riscv_pnot_i8x4(a);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT: [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT: [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP1]]
+//
+uint8x4_t test_pnot_u8x4(uint8x4_t a) {
+ return __riscv_pnot_u8x4(a);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP1]]
+//
+int16x2_t test_pnot_i16x2(int16x2_t a) {
+ return __riscv_pnot_i16x2(a);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT: [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV32-NEXT: ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT: [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV64-NEXT: ret i32 [[TMP1]]
+//
+uint16x2_t test_pnot_u16x2(uint16x2_t a) {
+ return __riscv_pnot_u16x2(a);
+}
+
+/* Packed Logical Operations (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_pand_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[AND_I1]]
+//
+int8x8_t test_pand_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_pand_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pand_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[AND_I1]]
+//
+uint8x8_t test_pand_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_pand_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pand_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[AND_I1]]
+//
+int16x4_t test_pand_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_pand_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pand_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[AND_I1]]
+//
+uint16x4_t test_pand_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_pand_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pand_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[AND_I1]]
+//
+int32x2_t test_pand_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_pand_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pand_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[AND_I1]]
+//
+uint32x2_t test_pand_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_pand_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[OR_I1]]
+//
+int8x8_t test_por_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_por_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[OR_I1]]
+//
+uint8x8_t test_por_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_por_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[OR_I1]]
+//
+int16x4_t test_por_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_por_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[OR_I1]]
+//
+uint16x4_t test_por_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_por_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[OR_I1]]
+//
+int32x2_t test_por_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_por_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[OR_I1]]
+//
+uint32x2_t test_por_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_por_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[XOR_I1]]
+//
+int8x8_t test_pxor_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_pxor_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[XOR_I1]]
+//
+uint8x8_t test_pxor_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_pxor_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[XOR_I1]]
+//
+int16x4_t test_pxor_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_pxor_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[XOR_I1]]
+//
+uint16x4_t test_pxor_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_pxor_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[XOR_I1]]
+//
+int32x2_t test_pxor_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_pxor_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT: ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT: ret i64 [[XOR_I1]]
+//
+uint32x2_t test_pxor_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_pxor_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+int8x8_t test_pnot_i8x8(int8x8_t a) {
+ return __riscv_pnot_i8x8(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+uint8x8_t test_pnot_u8x8(uint8x8_t a) {
+ return __riscv_pnot_u8x8(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+int16x4_t test_pnot_i16x4(int16x4_t a) {
+ return __riscv_pnot_i16x4(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+uint16x4_t test_pnot_u16x4(uint16x4_t a) {
+ return __riscv_pnot_u16x4(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+int32x2_t test_pnot_i32x2(int32x2_t a) {
+ return __riscv_pnot_i32x2(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT: [[ENTRY:.*:]]
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
+// RV32-NEXT: ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT: [[ENTRY:.*:]]
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
+// RV64-NEXT: ret i64 [[TMP1]]
+//
+uint32x2_t test_pnot_u32x2(uint32x2_t a) {
+ return __riscv_pnot_u32x2(a);
+}
diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
new file mode 100644
index 0000000000000..288780e1252c0
--- /dev/null
+++ b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
@@ -0,0 +1,1198 @@
+// REQUIRES: riscv-registered-target
+// expected-no-diagnostics
+
+// RUN: %clang %s -O2 -S -o - --target=riscv32 \
+// RUN: -menable-experimental-extensions -march=rv32i_p0p21 \
+// RUN: -Werror -Wextra -Xclang -verify \
+// RUN: | FileCheck %s --check-prefixes=CHECK,RV32
+// RUN: %clang %s -O2 -S -o - --target=riscv64 \
+// RUN: -menable-experimental-extensions -march=rv64i_p0p21 \
+// RUN: -Werror -Wextra -Xclang -verify \
+// RUN: | FileCheck %s --check-prefixes=CHECK,RV64
+
+#include <riscv_packed_simd.h>
+
+// CHECK-LABEL: test_pmv_s_u8x4:
+// CHECK: pmv.bs
+uint8x4_t test_pmv_s_u8x4(uint8_t x) { return __riscv_pmv_s_u8x4(x); }
+
+// CHECK-LABEL: test_pmv_s_i8x4:
+// CHECK: pmv.bs
+int8x4_t test_pmv_s_i8x4(int8_t x) { return __riscv_pmv_s_i8x4(x); }
+
+// CHECK-LABEL: test_pmv_s_u16x2:
+// CHECK: pmv.hs
+uint16x2_t test_pmv_s_u16x2(uint16_t x) { return __riscv_pmv_s_u16x2(x); }
+
+// CHECK-LABEL: test_pmv_s_i16x2:
+// CHECK: pmv.hs
+int16x2_t test_pmv_s_i16x2(int16_t x) { return __riscv_pmv_s_i16x2(x); }
+
+// TODO: On RV64, the 32-bit packed constant splat emits `lui`+`addi` instead
+// of `pli.b`/`pli.h` or `plui.h`.
+// CHECK-LABEL: test_pmv_s_u8x4_imm:
+// RV32: pli.b
+// RV64: lui
+uint8x4_t test_pmv_s_u8x4_imm(void) { return __riscv_pmv_s_u8x4(5); }
+
+// CHECK-LABEL: test_pmv_s_i8x4_imm:
+// RV32: pli.b
+// RV64: lui
+int8x4_t test_pmv_s_i8x4_imm(void) { return __riscv_pmv_s_i8x4(-3); }
+
+// CHECK-LABEL: test_pmv_s_u16x2_imm:
+// RV32: pli.h
+// RV64: lui
+uint16x2_t test_pmv_s_u16x2_imm(void) { return __riscv_pmv_s_u16x2(42); }
+
+// CHECK-LABEL: test_pmv_s_i16x2_imm:
+// RV32: pli.h
+// RV64: lui
+int16x2_t test_pmv_s_i16x2_imm(void) { return __riscv_pmv_s_i16x2(-5); }
+
+// CHECK-LABEL: test_pmv_s_u16x2_imm_hi:
+// RV32: plui.h
+// RV64: lui
+uint16x2_t test_pmv_s_u16x2_imm_hi(void) { return __riscv_pmv_s_u16x2(0x3600); }
+
+// CHECK-LABEL: test_pmv_s_i16x2_imm_hi:
+// RV32: plui.h
+// RV64: lui
+int16x2_t test_pmv_s_i16x2_imm_hi(void) { return __riscv_pmv_s_i16x2(0x3600); }
+
+// CHECK-LABEL: test_pmv_s_u8x8:
+// RV32: pmv.dbs
+// RV64: pmv.bs
+uint8x8_t test_pmv_s_u8x8(uint8_t x) { return __riscv_pmv_s_u8x8(x); }
+
+// CHECK-LABEL: test_pmv_s_i8x8:
+// RV32: pmv.dbs
+// RV64: pmv.bs
+int8x8_t test_pmv_s_i8x8(int8_t x) { return __riscv_pmv_s_i8x8(x); }
+
+// CHECK-LABEL: test_pmv_s_u16x4:
+// RV32: pmv.dhs
+// RV64: pmv.hs
+uint16x4_t test_pmv_s_u16x4(uint16_t x) { return __riscv_pmv_s_u16x4(x); }
+
+// CHECK-LABEL: test_pmv_s_i16x4:
+// RV32: pmv.dhs
+// RV64: pmv.hs
+int16x4_t test_pmv_s_i16x4(int16_t x) { return __riscv_pmv_s_i16x4(x); }
+
+// TODO: On RV32, the 32x2 variable splat emits a plain `mv` instead of
+// `padd.dws` with rs1_p=x0.
+// CHECK-LABEL: test_pmv_s_u32x2:
+// RV32: mv{{[[:space:]]}}
+// RV64: pmv.ws
+uint32x2_t test_pmv_s_u32x2(uint32_t x) { return __riscv_pmv_s_u32x2(x); }
+
+// CHECK-LABEL: test_pmv_s_i32x2:
+// RV32: mv{{[[:space:]]}}
+// RV64: pmv.ws
+int32x2_t test_pmv_s_i32x2(int32_t x) { return __riscv_pmv_s_i32x2(x); }
+
+// CHECK-LABEL: test_pmv_s_u8x8_imm:
+// RV32: pli.db
+// RV64: pli.b
+uint8x8_t test_pmv_s_u8x8_imm(void) { return __riscv_pmv_s_u8x8(5); }
+
+// CHECK-LABEL: test_pmv_s_i8x8_imm:
+// RV32: pli.db
+// RV64: pli.b
+int8x8_t test_pmv_s_i8x8_imm(void) { return __riscv_pmv_s_i8x8(-3); }
+
+// CHECK-LABEL: test_pmv_s_u16x4_imm:
+// RV32: pli.dh
+// RV64: pli.h
+uint16x4_t test_pmv_s_u16x4_imm(void) { return __riscv_pmv_s_u16x4(42); }
+
+// CHECK-LABEL: test_pmv_s_i16x4_imm:
+// RV32: pli.dh
+// RV64: pli.h
+int16x4_t test_pmv_s_i16x4_imm(void) { return __riscv_pmv_s_i16x4(-5); }
+
+// CHECK-LABEL: test_pmv_s_u16x4_imm_hi:
+// RV32: plui.dh
+// RV64: plui.h
+uint16x4_t test_pmv_s_u16x4_imm_hi(void) { return __riscv_pmv_s_u16x4(0x3600); }
+
+// CHECK-LABEL: test_pmv_s_i16x4_imm_hi:
+// RV32: plui.dh
+// RV64: plui.h
+int16x4_t test_pmv_s_i16x4_imm_hi(void) { return __riscv_pmv_s_i16x4(0x3600); }
+
+// Note: On RV32, constants that fit `addi`'s 12-bit immediate fold to 2x `li`.
+// Larger constants use `lui`+`addi`+`mv`; see the `_imm_big` tests below.
+// CHECK-LABEL: test_pmv_s_u32x2_imm:
+// RV32-COUNT-2: li{{[[:space:]]}}
+// RV64: pli.w
+uint32x2_t test_pmv_s_u32x2_imm(void) { return __riscv_pmv_s_u32x2(42); }
+
+// CHECK-LABEL: test_pmv_s_i32x2_imm:
+// RV32-COUNT-2: li{{[[:space:]]}}
+// RV64: pli.w
+int32x2_t test_pmv_s_i32x2_imm(void) { return __riscv_pmv_s_i32x2(-5); }
+
+// CHECK-LABEL: test_pmv_s_u32x2_imm_big:
+// RV32: lui
+// RV32-NEXT: addi
+// RV32-NEXT: mv{{[[:space:]]}}
+// RV32-NEXT: ret
+uint32x2_t test_pmv_s_u32x2_imm_big(void) {
+ return __riscv_pmv_s_u32x2(0x12345);
+}
+
+// CHECK-LABEL: test_pmv_s_i32x2_imm_big:
+// RV32: lui
+// RV32-NEXT: addi
+// RV32-NEXT: mv{{[[:space:]]}}
+// RV32-NEXT: ret
+int32x2_t test_pmv_s_i32x2_imm_big(void) {
+ return __riscv_pmv_s_i32x2(0x12345);
+}
+
+// CHECK-LABEL: test_padd_i8x4:
+// CHECK: padd.b
+int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
+ return __riscv_padd_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_u8x4:
+// CHECK: padd.b
+uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
+ return __riscv_padd_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_i16x2:
+// CHECK: padd.h
+int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_padd_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_u16x2:
+// CHECK: padd.h
+uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
+ return __riscv_padd_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_i8x4:
+// CHECK: psub.b
+int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
+ return __riscv_psub_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_u8x4:
+// CHECK: psub.b
+uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
+ return __riscv_psub_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_i16x2:
+// CHECK: psub.h
+int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_psub_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_u16x2:
+// CHECK: psub.h
+uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
+ return __riscv_psub_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pneg_i8x4:
+// CHECK: pneg.b
+int8x4_t test_pneg_i8x4(int8x4_t a) { return __riscv_pneg_i8x4(a); }
+
+// CHECK-LABEL: test_pneg_i16x2:
+// CHECK: pneg.h
+int16x2_t test_pneg_i16x2(int16x2_t a) { return __riscv_pneg_i16x2(a); }
+
+// CHECK-LABEL: test_padd_i8x8:
+// RV32: padd.db
+// RV64: padd.b
+int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_padd_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_u8x8:
+// RV32: padd.db
+// RV64: padd.b
+uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_padd_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_i16x4:
+// RV32: padd.dh
+// RV64: padd.h
+int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_padd_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_u16x4:
+// RV32: padd.dh
+// RV64: padd.h
+uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_padd_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_i32x2:
+// RV32: padd.dw
+// RV64: padd.w
+int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_padd_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_u32x2:
+// RV32: padd.dw
+// RV64: padd.w
+uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_padd_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_i8x8:
+// RV32: psub.db
+// RV64: psub.b
+int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_psub_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_psub_u8x8:
+// RV32: psub.db
+// RV64: psub.b
+uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_psub_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_psub_i16x4:
+// RV32: psub.dh
+// RV64: psub.h
+int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_psub_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_u16x4:
+// RV32: psub.dh
+// RV64: psub.h
+uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_psub_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_i32x2:
+// RV32: psub.dw
+// RV64: psub.w
+int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_psub_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_u32x2:
+// RV32: psub.dw
+// RV64: psub.w
+uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_psub_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pneg_i8x8:
+// RV32: pneg.db
+// RV64: pneg.b
+int8x8_t test_pneg_i8x8(int8x8_t a) { return __riscv_pneg_i8x8(a); }
+
+// CHECK-LABEL: test_pneg_i16x4:
+// RV32: pneg.dh
+// RV64: pneg.h
+int16x4_t test_pneg_i16x4(int16x4_t a) { return __riscv_pneg_i16x4(a); }
+
+// CHECK-LABEL: test_pneg_i32x2:
+// RV32: pneg.dw
+// RV64: pneg.w
+int32x2_t test_pneg_i32x2(int32x2_t a) { return __riscv_pneg_i32x2(a); }
+
+// CHECK-LABEL: test_padd_s_u8x4:
+// CHECK: padd.bs
+uint8x4_t test_padd_s_u8x4(uint8x4_t a, uint8_t b) {
+ return __riscv_padd_s_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i8x4:
+// CHECK: padd.bs
+int8x4_t test_padd_s_i8x4(int8x4_t a, int8_t b) {
+ return __riscv_padd_s_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u16x2:
+// CHECK: padd.hs
+uint16x2_t test_padd_s_u16x2(uint16x2_t a, uint16_t b) {
+ return __riscv_padd_s_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i16x2:
+// CHECK: padd.hs
+int16x2_t test_padd_s_i16x2(int16x2_t a, int16_t b) {
+ return __riscv_padd_s_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u8x8:
+// RV32: padd.dbs
+// RV64: padd.bs
+uint8x8_t test_padd_s_u8x8(uint8x8_t a, uint8_t b) {
+ return __riscv_padd_s_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i8x8:
+// RV32: padd.dbs
+// RV64: padd.bs
+int8x8_t test_padd_s_i8x8(int8x8_t a, int8_t b) {
+ return __riscv_padd_s_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u16x4:
+// RV32: padd.dhs
+// RV64: padd.hs
+uint16x4_t test_padd_s_u16x4(uint16x4_t a, uint16_t b) {
+ return __riscv_padd_s_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i16x4:
+// RV32: padd.dhs
+// RV64: padd.hs
+int16x4_t test_padd_s_i16x4(int16x4_t a, int16_t b) {
+ return __riscv_padd_s_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u32x2:
+// RV32: padd.dws
+// RV64: padd.ws
+uint32x2_t test_padd_s_u32x2(uint32x2_t a, uint32_t b) {
+ return __riscv_padd_s_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i32x2:
+// RV32: padd.dws
+// RV64: padd.ws
+int32x2_t test_padd_s_i32x2(int32x2_t a, int32_t b) {
+ return __riscv_padd_s_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i8x4:
+// CHECK: psadd.b
+int8x4_t test_psadd_i8x4(int8x4_t a, int8x4_t b) {
+ return __riscv_psadd_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i16x2:
+// CHECK: psadd.h
+int16x2_t test_psadd_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_psadd_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u8x4:
+// CHECK: psaddu.b
+uint8x4_t test_psaddu_u8x4(uint8x4_t a, uint8x4_t b) {
+ return __riscv_psaddu_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u16x2:
+// CHECK: psaddu.h
+uint16x2_t test_psaddu_u16x2(uint16x2_t a, uint16x2_t b) {
+ return __riscv_psaddu_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i8x4:
+// CHECK: pssub.b
+int8x4_t test_pssub_i8x4(int8x4_t a, int8x4_t b) {
+ return __riscv_pssub_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i16x2:
+// CHECK: pssub.h
+int16x2_t test_pssub_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_pssub_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u8x4:
+// CHECK: pssubu.b
+uint8x4_t test_pssubu_u8x4(uint8x4_t a, uint8x4_t b) {
+ return __riscv_pssubu_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u16x2:
+// CHECK: pssubu.h
+uint16x2_t test_pssubu_u16x2(uint16x2_t a, uint16x2_t b) {
+ return __riscv_pssubu_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i8x8:
+// RV32: psadd.db
+// RV64: psadd.b
+int8x8_t test_psadd_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_psadd_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i16x4:
+// RV32: psadd.dh
+// RV64: psadd.h
+int16x4_t test_psadd_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_psadd_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i32x2:
+// RV32: psadd.dw
+// RV64: psadd.w
+int32x2_t test_psadd_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_psadd_i32x2(a, b);
+}
+
+// Calls __riscv_psaddu_u8x8 on (a, b).
+// Expects psaddu.db under the RV32 prefix and psaddu.b under the RV64 prefix.
+// CHECK-LABEL: test_psaddu_u8x8:
+// RV32: psaddu.db
+// RV64: psaddu.b
+uint8x8_t test_psaddu_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_psaddu_u8x8(a, b);
+}
+
+// Calls __riscv_psaddu_u16x4 on (a, b).
+// Expects psaddu.dh under the RV32 prefix and psaddu.h under the RV64 prefix.
+// CHECK-LABEL: test_psaddu_u16x4:
+// RV32: psaddu.dh
+// RV64: psaddu.h
+uint16x4_t test_psaddu_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_psaddu_u16x4(a, b);
+}
+
+// Calls __riscv_psaddu_u32x2 on (a, b).
+// Expects psaddu.dw under the RV32 prefix and psaddu.w under the RV64 prefix.
+// CHECK-LABEL: test_psaddu_u32x2:
+// RV32: psaddu.dw
+// RV64: psaddu.w
+uint32x2_t test_psaddu_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_psaddu_u32x2(a, b);
+}
+
+// Calls __riscv_pssub_i8x8 on (a, b).
+// Expects pssub.db under the RV32 prefix and pssub.b under the RV64 prefix.
+// CHECK-LABEL: test_pssub_i8x8:
+// RV32: pssub.db
+// RV64: pssub.b
+int8x8_t test_pssub_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_pssub_i8x8(a, b);
+}
+
+// Calls __riscv_pssub_i16x4 on (a, b).
+// Expects pssub.dh under the RV32 prefix and pssub.h under the RV64 prefix.
+// CHECK-LABEL: test_pssub_i16x4:
+// RV32: pssub.dh
+// RV64: pssub.h
+int16x4_t test_pssub_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_pssub_i16x4(a, b);
+}
+
+// Calls __riscv_pssub_i32x2 on (a, b).
+// Expects pssub.dw under the RV32 prefix and pssub.w under the RV64 prefix.
+// CHECK-LABEL: test_pssub_i32x2:
+// RV32: pssub.dw
+// RV64: pssub.w
+int32x2_t test_pssub_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_pssub_i32x2(a, b);
+}
+
+// Calls __riscv_pssubu_u8x8 on (a, b).
+// Expects pssubu.db under the RV32 prefix and pssubu.b under the RV64 prefix.
+// CHECK-LABEL: test_pssubu_u8x8:
+// RV32: pssubu.db
+// RV64: pssubu.b
+uint8x8_t test_pssubu_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_pssubu_u8x8(a, b);
+}
+
+// Calls __riscv_pssubu_u16x4 on (a, b).
+// Expects pssubu.dh under the RV32 prefix and pssubu.h under the RV64 prefix.
+// CHECK-LABEL: test_pssubu_u16x4:
+// RV32: pssubu.dh
+// RV64: pssubu.h
+uint16x4_t test_pssubu_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_pssubu_u16x4(a, b);
+}
+
+// Calls __riscv_pssubu_u32x2 on (a, b).
+// Expects pssubu.dw under the RV32 prefix and pssubu.w under the RV64 prefix.
+// CHECK-LABEL: test_pssubu_u32x2:
+// RV32: pssubu.dw
+// RV64: pssubu.w
+uint32x2_t test_pssubu_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_pssubu_u32x2(a, b);
+}
+
+// Calls __riscv_psh1add_i16x2 on (a, b); expects psh1add.h in the output.
+// CHECK-LABEL: test_psh1add_i16x2:
+// CHECK: psh1add.h
+int16x2_t test_psh1add_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_psh1add_i16x2(a, b);
+}
+
+// Calls __riscv_psh1add_u16x2 on (a, b); expects psh1add.h, the same
+// mnemonic the signed i16x2 test expects.
+// CHECK-LABEL: test_psh1add_u16x2:
+// CHECK: psh1add.h
+uint16x2_t test_psh1add_u16x2(uint16x2_t a, uint16x2_t b) {
+ return __riscv_psh1add_u16x2(a, b);
+}
+
+// Calls __riscv_pssh1sadd_i16x2 on (a, b); expects pssh1sadd.h in the output.
+// CHECK-LABEL: test_pssh1sadd_i16x2:
+// CHECK: pssh1sadd.h
+int16x2_t test_pssh1sadd_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_pssh1sadd_i16x2(a, b);
+}
+
+// Calls __riscv_psh1add_i16x4 on (a, b).
+// Expects psh1add.dh under the RV32 prefix and psh1add.h under the RV64 prefix.
+// CHECK-LABEL: test_psh1add_i16x4:
+// RV32: psh1add.dh
+// RV64: psh1add.h
+int16x4_t test_psh1add_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_psh1add_i16x4(a, b);
+}
+
+// Calls __riscv_psh1add_u16x4 on (a, b).
+// Expects psh1add.dh under the RV32 prefix and psh1add.h under the RV64 prefix.
+// CHECK-LABEL: test_psh1add_u16x4:
+// RV32: psh1add.dh
+// RV64: psh1add.h
+uint16x4_t test_psh1add_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_psh1add_u16x4(a, b);
+}
+
+// Calls __riscv_psh1add_i32x2 on (a, b).
+// Expects psh1add.dw under the RV32 prefix and psh1add.w under the RV64 prefix.
+// CHECK-LABEL: test_psh1add_i32x2:
+// RV32: psh1add.dw
+// RV64: psh1add.w
+int32x2_t test_psh1add_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_psh1add_i32x2(a, b);
+}
+
+// Calls __riscv_psh1add_u32x2 on (a, b).
+// Expects psh1add.dw under the RV32 prefix and psh1add.w under the RV64 prefix.
+// CHECK-LABEL: test_psh1add_u32x2:
+// RV32: psh1add.dw
+// RV64: psh1add.w
+uint32x2_t test_psh1add_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_psh1add_u32x2(a, b);
+}
+
+// Calls __riscv_pssh1sadd_i16x4 on (a, b).
+// Expects pssh1sadd.dh under the RV32 prefix and pssh1sadd.h under the RV64
+// prefix.
+// CHECK-LABEL: test_pssh1sadd_i16x4:
+// RV32: pssh1sadd.dh
+// RV64: pssh1sadd.h
+int16x4_t test_pssh1sadd_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_pssh1sadd_i16x4(a, b);
+}
+
+// Calls __riscv_pssh1sadd_i32x2 on (a, b).
+// Expects pssh1sadd.dw under the RV32 prefix and pssh1sadd.w under the RV64
+// prefix.
+// CHECK-LABEL: test_pssh1sadd_i32x2:
+// RV32: pssh1sadd.dw
+// RV64: pssh1sadd.w
+int32x2_t test_pssh1sadd_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_pssh1sadd_i32x2(a, b);
+}
+
+// Calls __riscv_pmin_i8x4 on (a, b); expects pmin.b in the output.
+// CHECK-LABEL: test_pmin_i8x4:
+// CHECK: pmin.b
+int8x4_t test_pmin_i8x4(int8x4_t a, int8x4_t b) {
+ return __riscv_pmin_i8x4(a, b);
+}
+
+// Calls __riscv_pmin_i16x2 on (a, b); expects pmin.h in the output.
+// CHECK-LABEL: test_pmin_i16x2:
+// CHECK: pmin.h
+int16x2_t test_pmin_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_pmin_i16x2(a, b);
+}
+
+// Calls __riscv_pminu_u8x4 on (a, b); expects pminu.b in the output.
+// CHECK-LABEL: test_pminu_u8x4:
+// CHECK: pminu.b
+uint8x4_t test_pminu_u8x4(uint8x4_t a, uint8x4_t b) {
+ return __riscv_pminu_u8x4(a, b);
+}
+
+// Calls __riscv_pminu_u16x2 on (a, b); expects pminu.h in the output.
+// CHECK-LABEL: test_pminu_u16x2:
+// CHECK: pminu.h
+uint16x2_t test_pminu_u16x2(uint16x2_t a, uint16x2_t b) {
+ return __riscv_pminu_u16x2(a, b);
+}
+
+// Calls __riscv_pmax_i8x4 on (a, b); expects pmax.b in the output.
+// CHECK-LABEL: test_pmax_i8x4:
+// CHECK: pmax.b
+int8x4_t test_pmax_i8x4(int8x4_t a, int8x4_t b) {
+ return __riscv_pmax_i8x4(a, b);
+}
+
+// Calls __riscv_pmax_i16x2 on (a, b); expects pmax.h in the output.
+// CHECK-LABEL: test_pmax_i16x2:
+// CHECK: pmax.h
+int16x2_t test_pmax_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_pmax_i16x2(a, b);
+}
+
+// Calls __riscv_pmaxu_u8x4 on (a, b); expects pmaxu.b in the output.
+// CHECK-LABEL: test_pmaxu_u8x4:
+// CHECK: pmaxu.b
+uint8x4_t test_pmaxu_u8x4(uint8x4_t a, uint8x4_t b) {
+ return __riscv_pmaxu_u8x4(a, b);
+}
+
+// Calls __riscv_pmaxu_u16x2 on (a, b); expects pmaxu.h in the output.
+// CHECK-LABEL: test_pmaxu_u16x2:
+// CHECK: pmaxu.h
+uint16x2_t test_pmaxu_u16x2(uint16x2_t a, uint16x2_t b) {
+ return __riscv_pmaxu_u16x2(a, b);
+}
+
+// Calls __riscv_pmin_i8x8 on (a, b).
+// Expects pmin.db under the RV32 prefix and pmin.b under the RV64 prefix.
+// CHECK-LABEL: test_pmin_i8x8:
+// RV32: pmin.db
+// RV64: pmin.b
+int8x8_t test_pmin_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_pmin_i8x8(a, b);
+}
+
+// Calls __riscv_pmin_i16x4 on (a, b).
+// Expects pmin.dh under the RV32 prefix and pmin.h under the RV64 prefix.
+// CHECK-LABEL: test_pmin_i16x4:
+// RV32: pmin.dh
+// RV64: pmin.h
+int16x4_t test_pmin_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_pmin_i16x4(a, b);
+}
+
+// Calls __riscv_pmin_i32x2 on (a, b).
+// Expects pmin.dw under the RV32 prefix and pmin.w under the RV64 prefix.
+// CHECK-LABEL: test_pmin_i32x2:
+// RV32: pmin.dw
+// RV64: pmin.w
+int32x2_t test_pmin_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_pmin_i32x2(a, b);
+}
+
+// Calls __riscv_pminu_u8x8 on (a, b).
+// Expects pminu.db under the RV32 prefix and pminu.b under the RV64 prefix.
+// CHECK-LABEL: test_pminu_u8x8:
+// RV32: pminu.db
+// RV64: pminu.b
+uint8x8_t test_pminu_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_pminu_u8x8(a, b);
+}
+
+// Calls __riscv_pminu_u16x4 on (a, b).
+// Expects pminu.dh under the RV32 prefix and pminu.h under the RV64 prefix.
+// CHECK-LABEL: test_pminu_u16x4:
+// RV32: pminu.dh
+// RV64: pminu.h
+uint16x4_t test_pminu_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_pminu_u16x4(a, b);
+}
+
+// Calls __riscv_pminu_u32x2 on (a, b).
+// Expects pminu.dw under the RV32 prefix and pminu.w under the RV64 prefix.
+// CHECK-LABEL: test_pminu_u32x2:
+// RV32: pminu.dw
+// RV64: pminu.w
+uint32x2_t test_pminu_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_pminu_u32x2(a, b);
+}
+
+// Calls __riscv_pmax_i8x8 on (a, b).
+// Expects pmax.db under the RV32 prefix and pmax.b under the RV64 prefix.
+// CHECK-LABEL: test_pmax_i8x8:
+// RV32: pmax.db
+// RV64: pmax.b
+int8x8_t test_pmax_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_pmax_i8x8(a, b);
+}
+
+// Calls __riscv_pmax_i16x4 on (a, b).
+// Expects pmax.dh under the RV32 prefix and pmax.h under the RV64 prefix.
+// CHECK-LABEL: test_pmax_i16x4:
+// RV32: pmax.dh
+// RV64: pmax.h
+int16x4_t test_pmax_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_pmax_i16x4(a, b);
+}
+
+// Calls __riscv_pmax_i32x2 on (a, b).
+// Expects pmax.dw under the RV32 prefix and pmax.w under the RV64 prefix.
+// CHECK-LABEL: test_pmax_i32x2:
+// RV32: pmax.dw
+// RV64: pmax.w
+int32x2_t test_pmax_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_pmax_i32x2(a, b);
+}
+
+// Calls __riscv_pmaxu_u8x8 on (a, b).
+// Expects pmaxu.db under the RV32 prefix and pmaxu.b under the RV64 prefix.
+// CHECK-LABEL: test_pmaxu_u8x8:
+// RV32: pmaxu.db
+// RV64: pmaxu.b
+uint8x8_t test_pmaxu_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_pmaxu_u8x8(a, b);
+}
+
+// Calls __riscv_pmaxu_u16x4 on (a, b).
+// Expects pmaxu.dh under the RV32 prefix and pmaxu.h under the RV64 prefix.
+// CHECK-LABEL: test_pmaxu_u16x4:
+// RV32: pmaxu.dh
+// RV64: pmaxu.h
+uint16x4_t test_pmaxu_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_pmaxu_u16x4(a, b);
+}
+
+// Calls __riscv_pmaxu_u32x2 on (a, b).
+// Expects pmaxu.dw under the RV32 prefix and pmaxu.w under the RV64 prefix.
+// CHECK-LABEL: test_pmaxu_u32x2:
+// RV32: pmaxu.dw
+// RV64: pmaxu.w
+uint32x2_t test_pmaxu_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_pmaxu_u32x2(a, b);
+}
+
+// Calls __riscv_psll_s_u8x4 with the run-time shift amount n; expects psll.bs.
+// CHECK-LABEL: test_psll_s_u8x4:
+// CHECK: psll.bs
+uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned n) {
+ return __riscv_psll_s_u8x4(a, n);
+}
+
+// Calls __riscv_psll_s_i8x4 with the run-time shift amount n; expects psll.bs.
+// CHECK-LABEL: test_psll_s_i8x4:
+// CHECK: psll.bs
+int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned n) {
+ return __riscv_psll_s_i8x4(a, n);
+}
+
+// Calls __riscv_psll_s_u16x2 with the run-time shift amount n; expects psll.hs.
+// CHECK-LABEL: test_psll_s_u16x2:
+// CHECK: psll.hs
+uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned n) {
+ return __riscv_psll_s_u16x2(a, n);
+}
+
+// Calls __riscv_psll_s_i16x2 with the run-time shift amount n; expects psll.hs.
+// CHECK-LABEL: test_psll_s_i16x2:
+// CHECK: psll.hs
+int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned n) {
+ return __riscv_psll_s_i16x2(a, n);
+}
+
+// Calls __riscv_psrl_s_u8x4 with the run-time shift amount n; expects psrl.bs.
+// CHECK-LABEL: test_psrl_s_u8x4:
+// CHECK: psrl.bs
+uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned n) {
+ return __riscv_psrl_s_u8x4(a, n);
+}
+
+// Calls __riscv_psrl_s_u16x2 with the run-time shift amount n; expects psrl.hs.
+// CHECK-LABEL: test_psrl_s_u16x2:
+// CHECK: psrl.hs
+uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned n) {
+ return __riscv_psrl_s_u16x2(a, n);
+}
+
+// Calls __riscv_psra_s_i8x4 with the run-time shift amount n; expects psra.bs.
+// CHECK-LABEL: test_psra_s_i8x4:
+// CHECK: psra.bs
+int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned n) {
+ return __riscv_psra_s_i8x4(a, n);
+}
+
+// Calls __riscv_psra_s_i16x2 with the run-time shift amount n; expects psra.hs.
+// CHECK-LABEL: test_psra_s_i16x2:
+// CHECK: psra.hs
+int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned n) {
+ return __riscv_psra_s_i16x2(a, n);
+}
+
+// Calls __riscv_psll_s_u8x4 with the constant shift 2; expects the immediate
+// form pslli.b with a trailing operand of 2.
+// CHECK-LABEL: test_psll_s_u8x4_imm:
+// CHECK: pslli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x4_t test_psll_s_u8x4_imm(uint8x4_t a) {
+ return __riscv_psll_s_u8x4(a, 2);
+}
+
+// Calls __riscv_psll_s_i8x4 with the constant shift 3; expects the immediate
+// form pslli.b with a trailing operand of 3.
+// CHECK-LABEL: test_psll_s_i8x4_imm:
+// CHECK: pslli.b{{[[:space:]]+}}{{.*}}, 3
+int8x4_t test_psll_s_i8x4_imm(int8x4_t a) { return __riscv_psll_s_i8x4(a, 3); }
+
+// Calls __riscv_psll_s_u16x2 with the constant shift 5; expects the immediate
+// form pslli.h with a trailing operand of 5.
+// CHECK-LABEL: test_psll_s_u16x2_imm:
+// CHECK: pslli.h{{[[:space:]]+}}{{.*}}, 5
+uint16x2_t test_psll_s_u16x2_imm(uint16x2_t a) {
+ return __riscv_psll_s_u16x2(a, 5);
+}
+
+// Calls __riscv_psll_s_i16x2 with the constant shift 7; expects the immediate
+// form pslli.h with a trailing operand of 7.
+// CHECK-LABEL: test_psll_s_i16x2_imm:
+// CHECK: pslli.h{{[[:space:]]+}}{{.*}}, 7
+int16x2_t test_psll_s_i16x2_imm(int16x2_t a) {
+ return __riscv_psll_s_i16x2(a, 7);
+}
+
+// Calls __riscv_psrl_s_u8x4 with the constant shift 2; expects the immediate
+// form psrli.b with a trailing operand of 2.
+// CHECK-LABEL: test_psrl_s_u8x4_imm:
+// CHECK: psrli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x4_t test_psrl_s_u8x4_imm(uint8x4_t a) {
+ return __riscv_psrl_s_u8x4(a, 2);
+}
+
+// Calls __riscv_psrl_s_u16x2 with the constant shift 3; expects the immediate
+// form psrli.h with a trailing operand of 3.
+// CHECK-LABEL: test_psrl_s_u16x2_imm:
+// CHECK: psrli.h{{[[:space:]]+}}{{.*}}, 3
+uint16x2_t test_psrl_s_u16x2_imm(uint16x2_t a) {
+ return __riscv_psrl_s_u16x2(a, 3);
+}
+
+// Calls __riscv_psra_s_i8x4 with the constant shift 4; expects the immediate
+// form psrai.b with a trailing operand of 4.
+// CHECK-LABEL: test_psra_s_i8x4_imm:
+// CHECK: psrai.b{{[[:space:]]+}}{{.*}}, 4
+int8x4_t test_psra_s_i8x4_imm(int8x4_t a) { return __riscv_psra_s_i8x4(a, 4); }
+
+// Calls __riscv_psra_s_i16x2 with the constant shift 5; expects the immediate
+// form psrai.h with a trailing operand of 5.
+// CHECK-LABEL: test_psra_s_i16x2_imm:
+// CHECK: psrai.h{{[[:space:]]+}}{{.*}}, 5
+int16x2_t test_psra_s_i16x2_imm(int16x2_t a) {
+ return __riscv_psra_s_i16x2(a, 5);
+}
+
+// Calls __riscv_psll_s_u8x8 with the run-time shift amount n.
+// Expects psll.dbs under the RV32 prefix and psll.bs under the RV64 prefix.
+// CHECK-LABEL: test_psll_s_u8x8:
+// RV32: psll.dbs
+// RV64: psll.bs
+uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned n) {
+ return __riscv_psll_s_u8x8(a, n);
+}
+
+// Calls __riscv_psll_s_i8x8 with the run-time shift amount n.
+// Expects psll.dbs under the RV32 prefix and psll.bs under the RV64 prefix.
+// CHECK-LABEL: test_psll_s_i8x8:
+// RV32: psll.dbs
+// RV64: psll.bs
+int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned n) {
+ return __riscv_psll_s_i8x8(a, n);
+}
+
+// Calls __riscv_psll_s_u16x4 with the run-time shift amount n.
+// Expects psll.dhs under the RV32 prefix and psll.hs under the RV64 prefix.
+// CHECK-LABEL: test_psll_s_u16x4:
+// RV32: psll.dhs
+// RV64: psll.hs
+uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned n) {
+ return __riscv_psll_s_u16x4(a, n);
+}
+
+// Calls __riscv_psll_s_i16x4 with the run-time shift amount n.
+// Expects psll.dhs under the RV32 prefix and psll.hs under the RV64 prefix.
+// CHECK-LABEL: test_psll_s_i16x4:
+// RV32: psll.dhs
+// RV64: psll.hs
+int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned n) {
+ return __riscv_psll_s_i16x4(a, n);
+}
+
+// Calls __riscv_psll_s_u32x2 with the run-time shift amount n.
+// Expects psll.dws under the RV32 prefix and psll.ws under the RV64 prefix.
+// CHECK-LABEL: test_psll_s_u32x2:
+// RV32: psll.dws
+// RV64: psll.ws
+uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned n) {
+ return __riscv_psll_s_u32x2(a, n);
+}
+
+// Calls __riscv_psll_s_i32x2 with the run-time shift amount n.
+// Expects psll.dws under the RV32 prefix and psll.ws under the RV64 prefix.
+// CHECK-LABEL: test_psll_s_i32x2:
+// RV32: psll.dws
+// RV64: psll.ws
+int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned n) {
+ return __riscv_psll_s_i32x2(a, n);
+}
+
+// Calls __riscv_psrl_s_u8x8 with the run-time shift amount n.
+// Expects psrl.dbs under the RV32 prefix and psrl.bs under the RV64 prefix.
+// CHECK-LABEL: test_psrl_s_u8x8:
+// RV32: psrl.dbs
+// RV64: psrl.bs
+uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned n) {
+ return __riscv_psrl_s_u8x8(a, n);
+}
+
+// Calls __riscv_psrl_s_u16x4 with the run-time shift amount n.
+// Expects psrl.dhs under the RV32 prefix and psrl.hs under the RV64 prefix.
+// CHECK-LABEL: test_psrl_s_u16x4:
+// RV32: psrl.dhs
+// RV64: psrl.hs
+uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned n) {
+ return __riscv_psrl_s_u16x4(a, n);
+}
+
+// Calls __riscv_psrl_s_u32x2 with the run-time shift amount n.
+// Expects psrl.dws under the RV32 prefix and psrl.ws under the RV64 prefix.
+// CHECK-LABEL: test_psrl_s_u32x2:
+// RV32: psrl.dws
+// RV64: psrl.ws
+uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned n) {
+ return __riscv_psrl_s_u32x2(a, n);
+}
+
+// Calls __riscv_psra_s_i8x8 with the run-time shift amount n.
+// Expects psra.dbs under the RV32 prefix and psra.bs under the RV64 prefix.
+// CHECK-LABEL: test_psra_s_i8x8:
+// RV32: psra.dbs
+// RV64: psra.bs
+int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned n) {
+ return __riscv_psra_s_i8x8(a, n);
+}
+
+// Calls __riscv_psra_s_i16x4 with the run-time shift amount n.
+// Expects psra.dhs under the RV32 prefix and psra.hs under the RV64 prefix.
+// CHECK-LABEL: test_psra_s_i16x4:
+// RV32: psra.dhs
+// RV64: psra.hs
+int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned n) {
+ return __riscv_psra_s_i16x4(a, n);
+}
+
+// Calls __riscv_psra_s_i32x2 with the run-time shift amount n.
+// Expects psra.dws under the RV32 prefix and psra.ws under the RV64 prefix.
+// CHECK-LABEL: test_psra_s_i32x2:
+// RV32: psra.dws
+// RV64: psra.ws
+int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned n) {
+ return __riscv_psra_s_i32x2(a, n);
+}
+
+// Calls __riscv_psll_s_u8x8 with the constant shift 2. Expects the immediate
+// forms pslli.db (RV32 prefix) and pslli.b (RV64 prefix), trailing operand 2.
+// CHECK-LABEL: test_psll_s_u8x8_imm:
+// RV32: pslli.db{{[[:space:]]+}}{{.*}}, 2
+// RV64: pslli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x8_t test_psll_s_u8x8_imm(uint8x8_t a) {
+ return __riscv_psll_s_u8x8(a, 2);
+}
+
+// Calls __riscv_psll_s_i8x8 with the constant shift 3. Expects the immediate
+// forms pslli.db (RV32 prefix) and pslli.b (RV64 prefix), trailing operand 3.
+// CHECK-LABEL: test_psll_s_i8x8_imm:
+// RV32: pslli.db{{[[:space:]]+}}{{.*}}, 3
+// RV64: pslli.b{{[[:space:]]+}}{{.*}}, 3
+int8x8_t test_psll_s_i8x8_imm(int8x8_t a) { return __riscv_psll_s_i8x8(a, 3); }
+
+// Calls __riscv_psll_s_u16x4 with the constant shift 4. Expects the immediate
+// forms pslli.dh (RV32 prefix) and pslli.h (RV64 prefix), trailing operand 4.
+// CHECK-LABEL: test_psll_s_u16x4_imm:
+// RV32: pslli.dh{{[[:space:]]+}}{{.*}}, 4
+// RV64: pslli.h{{[[:space:]]+}}{{.*}}, 4
+uint16x4_t test_psll_s_u16x4_imm(uint16x4_t a) {
+ return __riscv_psll_s_u16x4(a, 4);
+}
+
+// Calls __riscv_psll_s_i16x4 with the constant shift 5. Expects the immediate
+// forms pslli.dh (RV32 prefix) and pslli.h (RV64 prefix), trailing operand 5.
+// CHECK-LABEL: test_psll_s_i16x4_imm:
+// RV32: pslli.dh{{[[:space:]]+}}{{.*}}, 5
+// RV64: pslli.h{{[[:space:]]+}}{{.*}}, 5
+int16x4_t test_psll_s_i16x4_imm(int16x4_t a) {
+ return __riscv_psll_s_i16x4(a, 5);
+}
+
+// Calls __riscv_psll_s_u32x2 with the constant shift 7. Expects the immediate
+// forms pslli.dw (RV32 prefix) and pslli.w (RV64 prefix), trailing operand 7.
+// CHECK-LABEL: test_psll_s_u32x2_imm:
+// RV32: pslli.dw{{[[:space:]]+}}{{.*}}, 7
+// RV64: pslli.w{{[[:space:]]+}}{{.*}}, 7
+uint32x2_t test_psll_s_u32x2_imm(uint32x2_t a) {
+ return __riscv_psll_s_u32x2(a, 7);
+}
+
+// Calls __riscv_psll_s_i32x2 with the constant shift 9. Expects the immediate
+// forms pslli.dw (RV32 prefix) and pslli.w (RV64 prefix), trailing operand 9.
+// CHECK-LABEL: test_psll_s_i32x2_imm:
+// RV32: pslli.dw{{[[:space:]]+}}{{.*}}, 9
+// RV64: pslli.w{{[[:space:]]+}}{{.*}}, 9
+int32x2_t test_psll_s_i32x2_imm(int32x2_t a) {
+ return __riscv_psll_s_i32x2(a, 9);
+}
+
+// Calls __riscv_psrl_s_u8x8 with the constant shift 2. Expects the immediate
+// forms psrli.db (RV32 prefix) and psrli.b (RV64 prefix), trailing operand 2.
+// CHECK-LABEL: test_psrl_s_u8x8_imm:
+// RV32: psrli.db{{[[:space:]]+}}{{.*}}, 2
+// RV64: psrli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x8_t test_psrl_s_u8x8_imm(uint8x8_t a) {
+ return __riscv_psrl_s_u8x8(a, 2);
+}
+
+// Calls __riscv_psrl_s_u16x4 with the constant shift 3. Expects the immediate
+// forms psrli.dh (RV32 prefix) and psrli.h (RV64 prefix), trailing operand 3.
+// CHECK-LABEL: test_psrl_s_u16x4_imm:
+// RV32: psrli.dh{{[[:space:]]+}}{{.*}}, 3
+// RV64: psrli.h{{[[:space:]]+}}{{.*}}, 3
+uint16x4_t test_psrl_s_u16x4_imm(uint16x4_t a) {
+ return __riscv_psrl_s_u16x4(a, 3);
+}
+
+// Calls __riscv_psrl_s_u32x2 with the constant shift 5. Expects the immediate
+// forms psrli.dw (RV32 prefix) and psrli.w (RV64 prefix), trailing operand 5.
+// CHECK-LABEL: test_psrl_s_u32x2_imm:
+// RV32: psrli.dw{{[[:space:]]+}}{{.*}}, 5
+// RV64: psrli.w{{[[:space:]]+}}{{.*}}, 5
+uint32x2_t test_psrl_s_u32x2_imm(uint32x2_t a) {
+ return __riscv_psrl_s_u32x2(a, 5);
+}
+
+// Calls __riscv_psra_s_i8x8 with the constant shift 4. Expects the immediate
+// forms psrai.db (RV32 prefix) and psrai.b (RV64 prefix), trailing operand 4.
+// CHECK-LABEL: test_psra_s_i8x8_imm:
+// RV32: psrai.db{{[[:space:]]+}}{{.*}}, 4
+// RV64: psrai.b{{[[:space:]]+}}{{.*}}, 4
+int8x8_t test_psra_s_i8x8_imm(int8x8_t a) { return __riscv_psra_s_i8x8(a, 4); }
+
+// Calls __riscv_psra_s_i16x4 with the constant shift 5. Expects the immediate
+// forms psrai.dh (RV32 prefix) and psrai.h (RV64 prefix), trailing operand 5.
+// CHECK-LABEL: test_psra_s_i16x4_imm:
+// RV32: psrai.dh{{[[:space:]]+}}{{.*}}, 5
+// RV64: psrai.h{{[[:space:]]+}}{{.*}}, 5
+int16x4_t test_psra_s_i16x4_imm(int16x4_t a) {
+ return __riscv_psra_s_i16x4(a, 5);
+}
+
+// Calls __riscv_psra_s_i32x2 with the constant shift 11. Expects the immediate
+// forms psrai.dw (RV32 prefix) and psrai.w (RV64 prefix), trailing operand 11.
+// CHECK-LABEL: test_psra_s_i32x2_imm:
+// RV32: psrai.dw{{[[:space:]]+}}{{.*}}, 11
+// RV64: psrai.w{{[[:space:]]+}}{{.*}}, 11
+int32x2_t test_psra_s_i32x2_imm(int32x2_t a) {
+ return __riscv_psra_s_i32x2(a, 11);
+}
+
+// Calls __riscv_pand_i8x4 on (a, b); expects `and` followed by whitespace.
+// CHECK-LABEL: test_pand_i8x4:
+// CHECK: and{{[[:space:]]}}
+int8x4_t test_pand_i8x4(int8x4_t a, int8x4_t b) {
+ return __riscv_pand_i8x4(a, b);
+}
+
+// Calls __riscv_pand_u8x4 on (a, b); expects `and` followed by whitespace.
+// CHECK-LABEL: test_pand_u8x4:
+// CHECK: and{{[[:space:]]}}
+uint8x4_t test_pand_u8x4(uint8x4_t a, uint8x4_t b) {
+ return __riscv_pand_u8x4(a, b);
+}
+
+// Calls __riscv_pand_i16x2 on (a, b); expects `and` followed by whitespace.
+// CHECK-LABEL: test_pand_i16x2:
+// CHECK: and{{[[:space:]]}}
+int16x2_t test_pand_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_pand_i16x2(a, b);
+}
+
+// Calls __riscv_pand_u16x2 on (a, b); expects `and` followed by whitespace.
+// CHECK-LABEL: test_pand_u16x2:
+// CHECK: and{{[[:space:]]}}
+uint16x2_t test_pand_u16x2(uint16x2_t a, uint16x2_t b) {
+ return __riscv_pand_u16x2(a, b);
+}
+
+// Calls __riscv_por_i8x4 on (a, b); expects `or` followed by whitespace.
+// CHECK-LABEL: test_por_i8x4:
+// CHECK: or{{[[:space:]]}}
+int8x4_t test_por_i8x4(int8x4_t a, int8x4_t b) {
+ return __riscv_por_i8x4(a, b);
+}
+
+// Calls __riscv_por_u8x4 on (a, b); expects `or` followed by whitespace.
+// CHECK-LABEL: test_por_u8x4:
+// CHECK: or{{[[:space:]]}}
+uint8x4_t test_por_u8x4(uint8x4_t a, uint8x4_t b) {
+ return __riscv_por_u8x4(a, b);
+}
+
+// Calls __riscv_por_i16x2 on (a, b); expects `or` followed by whitespace.
+// CHECK-LABEL: test_por_i16x2:
+// CHECK: or{{[[:space:]]}}
+int16x2_t test_por_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_por_i16x2(a, b);
+}
+
+// Calls __riscv_por_u16x2 on (a, b); expects `or` followed by whitespace.
+// CHECK-LABEL: test_por_u16x2:
+// CHECK: or{{[[:space:]]}}
+uint16x2_t test_por_u16x2(uint16x2_t a, uint16x2_t b) {
+ return __riscv_por_u16x2(a, b);
+}
+
+// Calls __riscv_pxor_i8x4 on (a, b); expects `xor` followed by whitespace.
+// CHECK-LABEL: test_pxor_i8x4:
+// CHECK: xor{{[[:space:]]}}
+int8x4_t test_pxor_i8x4(int8x4_t a, int8x4_t b) {
+ return __riscv_pxor_i8x4(a, b);
+}
+
+// Calls __riscv_pxor_u8x4 on (a, b); expects `xor` followed by whitespace.
+// CHECK-LABEL: test_pxor_u8x4:
+// CHECK: xor{{[[:space:]]}}
+uint8x4_t test_pxor_u8x4(uint8x4_t a, uint8x4_t b) {
+ return __riscv_pxor_u8x4(a, b);
+}
+
+// Calls __riscv_pxor_i16x2 on (a, b); expects `xor` followed by whitespace.
+// CHECK-LABEL: test_pxor_i16x2:
+// CHECK: xor{{[[:space:]]}}
+int16x2_t test_pxor_i16x2(int16x2_t a, int16x2_t b) {
+ return __riscv_pxor_i16x2(a, b);
+}
+
+// Calls __riscv_pxor_u16x2 on (a, b); expects `xor` followed by whitespace.
+// CHECK-LABEL: test_pxor_u16x2:
+// CHECK: xor{{[[:space:]]}}
+uint16x2_t test_pxor_u16x2(uint16x2_t a, uint16x2_t b) {
+ return __riscv_pxor_u16x2(a, b);
+}
+
+// Calls __riscv_pnot_i8x4 on a; expects `not` followed by whitespace.
+// CHECK-LABEL: test_pnot_i8x4:
+// CHECK: not{{[[:space:]]}}
+int8x4_t test_pnot_i8x4(int8x4_t a) { return __riscv_pnot_i8x4(a); }
+
+// Calls __riscv_pnot_u8x4 on a; expects `not` followed by whitespace.
+// CHECK-LABEL: test_pnot_u8x4:
+// CHECK: not{{[[:space:]]}}
+uint8x4_t test_pnot_u8x4(uint8x4_t a) { return __riscv_pnot_u8x4(a); }
+
+// Calls __riscv_pnot_i16x2 on a; expects `not` followed by whitespace.
+// CHECK-LABEL: test_pnot_i16x2:
+// CHECK: not{{[[:space:]]}}
+int16x2_t test_pnot_i16x2(int16x2_t a) { return __riscv_pnot_i16x2(a); }
+
+// Calls __riscv_pnot_u16x2 on a; expects `not` followed by whitespace.
+// CHECK-LABEL: test_pnot_u16x2:
+// CHECK: not{{[[:space:]]}}
+uint16x2_t test_pnot_u16x2(uint16x2_t a) { return __riscv_pnot_u16x2(a); }
+
+// Calls __riscv_pand_i8x8 on (a, b). Expects two `and` matches under the RV32
+// prefix and an `and` match under the RV64 prefix.
+// CHECK-LABEL: test_pand_i8x8:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64: and{{[[:space:]]}}
+int8x8_t test_pand_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_pand_i8x8(a, b);
+}
+
+// Calls __riscv_pand_u8x8 on (a, b). Expects two `and` matches under the RV32
+// prefix and an `and` match under the RV64 prefix.
+// CHECK-LABEL: test_pand_u8x8:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64: and{{[[:space:]]}}
+uint8x8_t test_pand_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_pand_u8x8(a, b);
+}
+
+// Calls __riscv_pand_i16x4 on (a, b). Expects two `and` matches under the RV32
+// prefix and an `and` match under the RV64 prefix.
+// CHECK-LABEL: test_pand_i16x4:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64: and{{[[:space:]]}}
+int16x4_t test_pand_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_pand_i16x4(a, b);
+}
+
+// Calls __riscv_pand_u16x4 on (a, b). Expects two `and` matches under the RV32
+// prefix and an `and` match under the RV64 prefix.
+// CHECK-LABEL: test_pand_u16x4:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64: and{{[[:space:]]}}
+uint16x4_t test_pand_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_pand_u16x4(a, b);
+}
+
+// Calls __riscv_pand_i32x2 on (a, b). Expects two `and` matches under the RV32
+// prefix and an `and` match under the RV64 prefix.
+// CHECK-LABEL: test_pand_i32x2:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64: and{{[[:space:]]}}
+int32x2_t test_pand_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_pand_i32x2(a, b);
+}
+
+// Calls __riscv_pand_u32x2 on (a, b). Expects two `and` matches under the RV32
+// prefix and an `and` match under the RV64 prefix.
+// CHECK-LABEL: test_pand_u32x2:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64: and{{[[:space:]]}}
+uint32x2_t test_pand_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_pand_u32x2(a, b);
+}
+
+// Calls __riscv_por_i8x8 on (a, b). Expects two `or` matches under the RV32
+// prefix and an `or` match under the RV64 prefix.
+// CHECK-LABEL: test_por_i8x8:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64: or{{[[:space:]]}}
+int8x8_t test_por_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_por_i8x8(a, b);
+}
+
+// Calls __riscv_por_u8x8 on (a, b). Expects two `or` matches under the RV32
+// prefix and an `or` match under the RV64 prefix.
+// CHECK-LABEL: test_por_u8x8:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64: or{{[[:space:]]}}
+uint8x8_t test_por_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_por_u8x8(a, b);
+}
+
+// Calls __riscv_por_i16x4 on (a, b). Expects two `or` matches under the RV32
+// prefix and an `or` match under the RV64 prefix.
+// CHECK-LABEL: test_por_i16x4:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64: or{{[[:space:]]}}
+int16x4_t test_por_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_por_i16x4(a, b);
+}
+
+// Calls __riscv_por_u16x4 on (a, b). Expects two `or` matches under the RV32
+// prefix and an `or` match under the RV64 prefix.
+// CHECK-LABEL: test_por_u16x4:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64: or{{[[:space:]]}}
+uint16x4_t test_por_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_por_u16x4(a, b);
+}
+
+// Calls __riscv_por_i32x2 on (a, b). Expects two `or` matches under the RV32
+// prefix and an `or` match under the RV64 prefix.
+// CHECK-LABEL: test_por_i32x2:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64: or{{[[:space:]]}}
+int32x2_t test_por_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_por_i32x2(a, b);
+}
+
+// Calls __riscv_por_u32x2 on (a, b). Expects two `or` matches under the RV32
+// prefix and an `or` match under the RV64 prefix.
+// CHECK-LABEL: test_por_u32x2:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64: or{{[[:space:]]}}
+uint32x2_t test_por_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_por_u32x2(a, b);
+}
+
+// Calls __riscv_pxor_i8x8 on (a, b). Expects two `xor` matches under the RV32
+// prefix and an `xor` match under the RV64 prefix.
+// CHECK-LABEL: test_pxor_i8x8:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64: xor{{[[:space:]]}}
+int8x8_t test_pxor_i8x8(int8x8_t a, int8x8_t b) {
+ return __riscv_pxor_i8x8(a, b);
+}
+
+// Calls __riscv_pxor_u8x8 on (a, b). Expects two `xor` matches under the RV32
+// prefix and an `xor` match under the RV64 prefix.
+// CHECK-LABEL: test_pxor_u8x8:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64: xor{{[[:space:]]}}
+uint8x8_t test_pxor_u8x8(uint8x8_t a, uint8x8_t b) {
+ return __riscv_pxor_u8x8(a, b);
+}
+
+// Calls __riscv_pxor_i16x4 on (a, b). Expects two `xor` matches under the RV32
+// prefix and an `xor` match under the RV64 prefix.
+// CHECK-LABEL: test_pxor_i16x4:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64: xor{{[[:space:]]}}
+int16x4_t test_pxor_i16x4(int16x4_t a, int16x4_t b) {
+ return __riscv_pxor_i16x4(a, b);
+}
+
+// Calls __riscv_pxor_u16x4 on (a, b). Expects two `xor` matches under the RV32
+// prefix and an `xor` match under the RV64 prefix.
+// CHECK-LABEL: test_pxor_u16x4:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64: xor{{[[:space:]]}}
+uint16x4_t test_pxor_u16x4(uint16x4_t a, uint16x4_t b) {
+ return __riscv_pxor_u16x4(a, b);
+}
+
+// Calls __riscv_pxor_i32x2 on (a, b). Expects two `xor` matches under the RV32
+// prefix and an `xor` match under the RV64 prefix.
+// CHECK-LABEL: test_pxor_i32x2:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64: xor{{[[:space:]]}}
+int32x2_t test_pxor_i32x2(int32x2_t a, int32x2_t b) {
+ return __riscv_pxor_i32x2(a, b);
+}
+
+// Calls __riscv_pxor_u32x2 on (a, b). Expects two `xor` matches under the RV32
+// prefix and an `xor` match under the RV64 prefix.
+// CHECK-LABEL: test_pxor_u32x2:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64: xor{{[[:space:]]}}
+uint32x2_t test_pxor_u32x2(uint32x2_t a, uint32x2_t b) {
+ return __riscv_pxor_u32x2(a, b);
+}
+
+// Calls __riscv_pnot_i8x8 on a. Expects two `not` matches under the RV32
+// prefix and a `not` match under the RV64 prefix.
+// CHECK-LABEL: test_pnot_i8x8:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64: not{{[[:space:]]}}
+int8x8_t test_pnot_i8x8(int8x8_t a) { return __riscv_pnot_i8x8(a); }
+
+// Calls __riscv_pnot_u8x8 on a. Expects two `not` matches under the RV32
+// prefix and a `not` match under the RV64 prefix.
+// CHECK-LABEL: test_pnot_u8x8:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64: not{{[[:space:]]}}
+uint8x8_t test_pnot_u8x8(uint8x8_t a) { return __riscv_pnot_u8x8(a); }
+
+// Calls __riscv_pnot_i16x4 on a. Expects two `not` matches under the RV32
+// prefix and a `not` match under the RV64 prefix.
+// CHECK-LABEL: test_pnot_i16x4:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64: not{{[[:space:]]}}
+int16x4_t test_pnot_i16x4(int16x4_t a) { return __riscv_pnot_i16x4(a); }
+
+// Calls __riscv_pnot_u16x4 on a. Expects two `not` matches under the RV32
+// prefix and a `not` match under the RV64 prefix.
+// CHECK-LABEL: test_pnot_u16x4:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64: not{{[[:space:]]}}
+uint16x4_t test_pnot_u16x4(uint16x4_t a) { return __riscv_pnot_u16x4(a); }
+
+// Calls __riscv_pnot_i32x2 on a. Expects two `not` matches under the RV32
+// prefix and a `not` match under the RV64 prefix.
+// CHECK-LABEL: test_pnot_i32x2:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64: not{{[[:space:]]}}
+int32x2_t test_pnot_i32x2(int32x2_t a) { return __riscv_pnot_i32x2(a); }
+
+// Calls __riscv_pnot_u32x2 on a. Expects two `not` matches under the RV32
+// prefix and a `not` match under the RV64 prefix.
+// CHECK-LABEL: test_pnot_u32x2:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64: not{{[[:space:]]}}
+uint32x2_t test_pnot_u32x2(uint32x2_t a) { return __riscv_pnot_u32x2(a); }
More information about the cfe-commits mailing list