[clang] b000f90 - [RISCV] Add riscv_packed_simd.h for P extension intrinsics (#181115)

via cfe-commits cfe-commits at lists.llvm.org
Wed Jun 10 02:00:42 PDT 2026


Author: SiHuaN
Date: 2026-06-10T17:00:37+08:00
New Revision: b000f9032911f32c0e68e373e083ccc90aae0005

URL: https://github.com/llvm/llvm-project/commit/b000f9032911f32c0e68e373e083ccc90aae0005
DIFF: https://github.com/llvm/llvm-project/commit/b000f9032911f32c0e68e373e083ccc90aae0005.diff

LOG: [RISCV] Add riscv_packed_simd.h for P extension intrinsics (#181115)

Add `riscv_packed_simd.h` with initial RISC-V P extension intrinsics, covering:

- Packed Splat
- Packed Addition and Subtraction
- Packed Addition with Scalar
- Packed Saturating Addition and Subtraction
- Packed Shift-Add
- Packed Minimum and Maximum
- Packed Shifts
- Packed Logical Operations

The intrinsics are implemented as thin wrappers over standard C operators
and existing generic builtins (`__builtin_elementwise_add_sat` etc.), letting
the RISC-V backend lower the resulting `<N x iN>` IR to P-ext instructions.
No new clang builtins or `llvm.riscv.*` intrinsics are introduced.

Spec: https://github.com/riscv/riscv-p-spec/blob/master/P-ext-intrinsics.adoc

Added: 
    clang/lib/Headers/riscv_packed_simd.h
    clang/test/CodeGen/RISCV/rvp-intrinsics.c
    cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c

Modified: 
    clang/lib/Headers/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ce34f8b9410a7..439f2725168ba 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -140,6 +140,7 @@ set(riscv_files
   riscv_corev_alu.h
   riscv_mips.h
   riscv_nds.h
+  riscv_packed_simd.h
   sifive_vector.h
   )
 

diff  --git a/clang/lib/Headers/riscv_packed_simd.h b/clang/lib/Headers/riscv_packed_simd.h
new file mode 100644
index 0000000000000..828cb90f8034a
--- /dev/null
+++ b/clang/lib/Headers/riscv_packed_simd.h
@@ -0,0 +1,306 @@
+/*===---- riscv_packed_simd.h - RISC-V P intrinsics ------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_PACKED_SIMD_H
+#define __RISCV_PACKED_SIMD_H
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Packed SIMD types, named <elem>x<lanes>_t. */
+/* 4-byte (32-bit) vectors: */
+typedef int8_t int8x4_t __attribute__((__vector_size__(4)));
+typedef uint8_t uint8x4_t __attribute__((__vector_size__(4)));
+typedef int16_t int16x2_t __attribute__((__vector_size__(4)));
+typedef uint16_t uint16x2_t __attribute__((__vector_size__(4)));
+/* 8-byte (64-bit) vectors: */
+typedef int8_t int8x8_t __attribute__((__vector_size__(8)));
+typedef uint8_t uint8x8_t __attribute__((__vector_size__(8)));
+typedef int16_t int16x4_t __attribute__((__vector_size__(8)));
+typedef uint16_t uint16x4_t __attribute__((__vector_size__(8)));
+typedef int32_t int32x2_t __attribute__((__vector_size__(8)));
+typedef uint32_t uint32x2_t __attribute__((__vector_size__(8)));
+/* Applied to every wrapper below: force inlining, emit no debug info. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+/* Vector literal of type ty with every lane (2, 4 or 8 of them) set to x. */
+#define __packed_splat2(ty, x) ((ty){(x), (x)})
+#define __packed_splat4(ty, x) ((ty){(x), (x), (x), (x)})
+#define __packed_splat8(ty, x) ((ty){(x), (x), (x), (x), (x), (x), (x), (x)})
+/* Defines __riscv_<name>(scalar_ty __x): builds a ty through splat(ty, __x). */
+#define __packed_splat(name, ty, scalar_ty, splat)                             \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(scalar_ty __x) {      \
+    return splat(ty, __x);                                                     \
+  }
+/* Shifts __rs1 by (__rs2 & mask); mask is 0x7/0xf/0x1f for shift8/16/32. */
+#define __packed_shift(name, ty, op, mask)                                     \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1,             \
+                                                         unsigned __rs2) {     \
+    return __rs1 op(__rs2 & (mask));                                           \
+  }
+#define __packed_shift8(name, ty, op) __packed_shift(name, ty, op, 0x7)
+#define __packed_shift16(name, ty, op) __packed_shift(name, ty, op, 0xf)
+#define __packed_shift32(name, ty, op) __packed_shift(name, ty, op, 0x1f)
+/* Defines __riscv_<name>(ty, scalar_ty): __rs1 op splat(ty, __rs2). */
+#define __packed_scalar_binary_op(name, ty, scalar_ty, op, splat)              \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1,             \
+                                                         scalar_ty __rs2) {    \
+    return __rs1 op splat(ty, __rs2);                                          \
+  }
+/* Defines __riscv_<name>(ty __rs1, ty __rs2), which returns __rs1 op __rs2. */
+#define __packed_binary_op(name, ty, op)                                       \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
+    return __rs1 op __rs2;                                                     \
+  }
+/* Defines __riscv_<name>(ty __rs1), which returns op applied to __rs1. */
+#define __packed_unary_op(name, ty, op)                                        \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1) {           \
+    return op __rs1;                                                           \
+  }
+/* Defines __riscv_<name>(ty, ty), which returns builtin(__rs1, __rs2). */
+#define __packed_binary_builtin(name, ty, builtin)                             \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
+    return builtin(__rs1, __rs2);                                              \
+  }
+/* Defines __riscv_<name>(ty, ty), which returns (__rs1 << 1) + __rs2. */
+#define __packed_sh1add(name, ty)                                              \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
+    return (__rs1 << 1) + __rs2;                                               \
+  }
+/* Defines __riscv_<name>(ty, ty): add_sat(add_sat(__rs1, __rs1), __rs2). */
+/* TODO: switch to sadd_sat(__builtin_elementwise_shl_sat(a, 1), b) once a
+ * generic elementwise shl_sat builtin exists. sadd_sat(a, a) is equivalent
+ * for signed types and the backend's saturating_shl1 PatFrags matches both
+ * shapes. */
+#define __packed_sh1sadd(name, ty)                                             \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
+    return __builtin_elementwise_add_sat(                                      \
+        __builtin_elementwise_add_sat(__rs1, __rs1), __rs2);                   \
+  }
+
+// clang-format off: macro call sites have no trailing semicolons, which
+// confuses clang-format into a deeply nested expression.
+
+/* Packed Splat (32-bit) */
+__packed_splat(pmv_s_u8x4, uint8x4_t, uint8_t, __packed_splat4)
+__packed_splat(pmv_s_i8x4, int8x4_t, int8_t, __packed_splat4)
+__packed_splat(pmv_s_u16x2, uint16x2_t, uint16_t, __packed_splat2)
+__packed_splat(pmv_s_i16x2, int16x2_t, int16_t, __packed_splat2)
+
+/* Packed Splat (64-bit) */
+__packed_splat(pmv_s_u8x8, uint8x8_t, uint8_t, __packed_splat8)
+__packed_splat(pmv_s_i8x8, int8x8_t, int8_t, __packed_splat8)
+__packed_splat(pmv_s_u16x4, uint16x4_t, uint16_t, __packed_splat4)
+__packed_splat(pmv_s_i16x4, int16x4_t, int16_t, __packed_splat4)
+__packed_splat(pmv_s_u32x2, uint32x2_t, uint32_t, __packed_splat2)
+__packed_splat(pmv_s_i32x2, int32x2_t, int32_t, __packed_splat2)
+
+/* Packed Addition and Subtraction (32-bit) */
+__packed_binary_op(padd_i8x4, int8x4_t, +)
+__packed_binary_op(padd_u8x4, uint8x4_t, +)
+__packed_binary_op(padd_i16x2, int16x2_t, +)
+__packed_binary_op(padd_u16x2, uint16x2_t, +)
+__packed_binary_op(psub_i8x4, int8x4_t, -)
+__packed_binary_op(psub_u8x4, uint8x4_t, -)
+__packed_binary_op(psub_i16x2, int16x2_t, -)
+__packed_binary_op(psub_u16x2, uint16x2_t, -)
+__packed_unary_op(pneg_i8x4, int8x4_t, -)
+__packed_unary_op(pneg_i16x2, int16x2_t, -)
+
+/* Packed Addition and Subtraction (64-bit) */
+__packed_binary_op(padd_i8x8, int8x8_t, +)
+__packed_binary_op(padd_u8x8, uint8x8_t, +)
+__packed_binary_op(padd_i16x4, int16x4_t, +)
+__packed_binary_op(padd_u16x4, uint16x4_t, +)
+__packed_binary_op(padd_i32x2, int32x2_t, +)
+__packed_binary_op(padd_u32x2, uint32x2_t, +)
+__packed_binary_op(psub_i8x8, int8x8_t, -)
+__packed_binary_op(psub_u8x8, uint8x8_t, -)
+__packed_binary_op(psub_i16x4, int16x4_t, -)
+__packed_binary_op(psub_u16x4, uint16x4_t, -)
+__packed_binary_op(psub_i32x2, int32x2_t, -)
+__packed_binary_op(psub_u32x2, uint32x2_t, -)
+__packed_unary_op(pneg_i8x8, int8x8_t, -)
+__packed_unary_op(pneg_i16x4, int16x4_t, -)
+__packed_unary_op(pneg_i32x2, int32x2_t, -)
+
+/* Packed Addition with Scalar (32-bit) */
+__packed_scalar_binary_op(padd_s_u8x4, uint8x4_t, uint8_t, +, __packed_splat4)
+__packed_scalar_binary_op(padd_s_i8x4, int8x4_t, int8_t, +, __packed_splat4)
+__packed_scalar_binary_op(padd_s_u16x2, uint16x2_t, uint16_t, +,
+                          __packed_splat2)
+__packed_scalar_binary_op(padd_s_i16x2, int16x2_t, int16_t, +,
+                          __packed_splat2)
+
+/* Packed Addition with Scalar (64-bit) */
+__packed_scalar_binary_op(padd_s_u8x8, uint8x8_t, uint8_t, +, __packed_splat8)
+__packed_scalar_binary_op(padd_s_i8x8, int8x8_t, int8_t, +, __packed_splat8)
+__packed_scalar_binary_op(padd_s_u16x4, uint16x4_t, uint16_t, +,
+                          __packed_splat4)
+__packed_scalar_binary_op(padd_s_i16x4, int16x4_t, int16_t, +,
+                          __packed_splat4)
+__packed_scalar_binary_op(padd_s_u32x2, uint32x2_t, uint32_t, +,
+                          __packed_splat2)
+__packed_scalar_binary_op(padd_s_i32x2, int32x2_t, int32_t, +,
+                          __packed_splat2)
+
+/* Packed Saturating Addition and Subtraction (32-bit) */
+__packed_binary_builtin(psadd_i8x4, int8x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psadd_i16x2, int16x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u8x4, uint8x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u16x2, uint16x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(pssub_i8x4, int8x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssub_i16x2, int16x2_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u8x4, uint8x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u16x2, uint16x2_t, __builtin_elementwise_sub_sat)
+
+/* Packed Saturating Addition and Subtraction (64-bit) */
+__packed_binary_builtin(psadd_i8x8, int8x8_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psadd_i16x4, int16x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psadd_i32x2, int32x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u8x8, uint8x8_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u16x4, uint16x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u32x2, uint32x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(pssub_i8x8, int8x8_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssub_i16x4, int16x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssub_i32x2, int32x2_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u8x8, uint8x8_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u16x4, uint16x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u32x2, uint32x2_t, __builtin_elementwise_sub_sat)
+
+/* Packed Shift-Add (32-bit) */
+__packed_sh1add(psh1add_i16x2, int16x2_t)
+__packed_sh1add(psh1add_u16x2, uint16x2_t)
+__packed_sh1sadd(pssh1sadd_i16x2, int16x2_t)
+
+/* Packed Shift-Add (64-bit) */
+__packed_sh1add(psh1add_i16x4, int16x4_t)
+__packed_sh1add(psh1add_u16x4, uint16x4_t)
+__packed_sh1add(psh1add_i32x2, int32x2_t)
+__packed_sh1add(psh1add_u32x2, uint32x2_t)
+__packed_sh1sadd(pssh1sadd_i16x4, int16x4_t)
+__packed_sh1sadd(pssh1sadd_i32x2, int32x2_t)
+
+/* Packed Minimum and Maximum (32-bit) */
+__packed_binary_builtin(pmin_i8x4, int8x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmin_i16x2, int16x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u8x4, uint8x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u16x2, uint16x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmax_i8x4, int8x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmax_i16x2, int16x2_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u8x4, uint8x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u16x2, uint16x2_t, __builtin_elementwise_max)
+
+/* Packed Minimum and Maximum (64-bit) */
+__packed_binary_builtin(pmin_i8x8, int8x8_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmin_i16x4, int16x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmin_i32x2, int32x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u8x8, uint8x8_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u16x4, uint16x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u32x2, uint32x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmax_i8x8, int8x8_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmax_i16x4, int16x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmax_i32x2, int32x2_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u8x8, uint8x8_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u16x4, uint16x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u32x2, uint32x2_t, __builtin_elementwise_max)
+
+/* Packed Shifts (32-bit) */
+__packed_shift8(psll_s_u8x4, uint8x4_t, <<)
+__packed_shift8(psll_s_i8x4, int8x4_t, <<)
+__packed_shift16(psll_s_u16x2, uint16x2_t, <<)
+__packed_shift16(psll_s_i16x2, int16x2_t, <<)
+__packed_shift8(psrl_s_u8x4, uint8x4_t, >>)
+__packed_shift16(psrl_s_u16x2, uint16x2_t, >>)
+__packed_shift8(psra_s_i8x4, int8x4_t, >>)
+__packed_shift16(psra_s_i16x2, int16x2_t, >>)
+
+/* Packed Shifts (64-bit) */
+__packed_shift8(psll_s_u8x8, uint8x8_t, <<)
+__packed_shift8(psll_s_i8x8, int8x8_t, <<)
+__packed_shift16(psll_s_u16x4, uint16x4_t, <<)
+__packed_shift16(psll_s_i16x4, int16x4_t, <<)
+__packed_shift32(psll_s_u32x2, uint32x2_t, <<)
+__packed_shift32(psll_s_i32x2, int32x2_t, <<)
+__packed_shift8(psrl_s_u8x8, uint8x8_t, >>)
+__packed_shift16(psrl_s_u16x4, uint16x4_t, >>)
+__packed_shift32(psrl_s_u32x2, uint32x2_t, >>)
+__packed_shift8(psra_s_i8x8, int8x8_t, >>)
+__packed_shift16(psra_s_i16x4, int16x4_t, >>)
+__packed_shift32(psra_s_i32x2, int32x2_t, >>)
+
+/* Packed Logical Operations (32-bit) */
+__packed_binary_op(pand_i8x4, int8x4_t, &)
+__packed_binary_op(pand_u8x4, uint8x4_t, &)
+__packed_binary_op(pand_i16x2, int16x2_t, &)
+__packed_binary_op(pand_u16x2, uint16x2_t, &)
+__packed_binary_op(por_i8x4, int8x4_t, |)
+__packed_binary_op(por_u8x4, uint8x4_t, |)
+__packed_binary_op(por_i16x2, int16x2_t, |)
+__packed_binary_op(por_u16x2, uint16x2_t, |)
+__packed_binary_op(pxor_i8x4, int8x4_t, ^)
+__packed_binary_op(pxor_u8x4, uint8x4_t, ^)
+__packed_binary_op(pxor_i16x2, int16x2_t, ^)
+__packed_binary_op(pxor_u16x2, uint16x2_t, ^)
+__packed_unary_op(pnot_i8x4, int8x4_t, ~)
+__packed_unary_op(pnot_u8x4, uint8x4_t, ~)
+__packed_unary_op(pnot_i16x2, int16x2_t, ~)
+__packed_unary_op(pnot_u16x2, uint16x2_t, ~)
+
+/* Packed Logical Operations (64-bit) */
+__packed_binary_op(pand_i8x8, int8x8_t, &)
+__packed_binary_op(pand_u8x8, uint8x8_t, &)
+__packed_binary_op(pand_i16x4, int16x4_t, &)
+__packed_binary_op(pand_u16x4, uint16x4_t, &)
+__packed_binary_op(pand_i32x2, int32x2_t, &)
+__packed_binary_op(pand_u32x2, uint32x2_t, &)
+__packed_binary_op(por_i8x8, int8x8_t, |)
+__packed_binary_op(por_u8x8, uint8x8_t, |)
+__packed_binary_op(por_i16x4, int16x4_t, |)
+__packed_binary_op(por_u16x4, uint16x4_t, |)
+__packed_binary_op(por_i32x2, int32x2_t, |)
+__packed_binary_op(por_u32x2, uint32x2_t, |)
+__packed_binary_op(pxor_i8x8, int8x8_t, ^)
+__packed_binary_op(pxor_u8x8, uint8x8_t, ^)
+__packed_binary_op(pxor_i16x4, int16x4_t, ^)
+__packed_binary_op(pxor_u16x4, uint16x4_t, ^)
+__packed_binary_op(pxor_i32x2, int32x2_t, ^)
+__packed_binary_op(pxor_u32x2, uint32x2_t, ^)
+__packed_unary_op(pnot_i8x8, int8x8_t, ~)
+__packed_unary_op(pnot_u8x8, uint8x8_t, ~)
+__packed_unary_op(pnot_i16x4, int16x4_t, ~)
+__packed_unary_op(pnot_u16x4, uint16x4_t, ~)
+__packed_unary_op(pnot_i32x2, int32x2_t, ~)
+__packed_unary_op(pnot_u32x2, uint32x2_t, ~)
+
+// clang-format on
+
+#undef __packed_splat2
+#undef __packed_splat4
+#undef __packed_splat8
+#undef __packed_splat
+#undef __packed_shift
+#undef __packed_shift8
+#undef __packed_shift16
+#undef __packed_shift32
+#undef __packed_scalar_binary_op
+#undef __packed_binary_op
+#undef __packed_unary_op
+#undef __packed_binary_builtin
+#undef __packed_sh1add
+#undef __packed_sh1sadd
+#undef __DEFAULT_FN_ATTRS
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __RISCV_PACKED_SIMD_H */

diff  --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
new file mode 100644
index 0000000000000..73db0bee19def
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -0,0 +1,3349 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple riscv32 -target-feature +experimental-p \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=sroa,instcombine | FileCheck %s --check-prefix=RV32
+// RUN: %clang_cc1 -triple riscv64 -target-feature +experimental-p \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=sroa,instcombine | FileCheck %s --check-prefix=RV64
+
+#include <riscv_packed_simd.h>
+
+/* Packed Splat (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_u8x4(
+// RV32-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_u8x4(
+// RV64-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP0]]
+//
+uint8x4_t test_pmv_s_u8x4(uint8_t x) {
+  return __riscv_pmv_s_u8x4(x);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_i8x4(
+// RV32-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_i8x4(
+// RV64-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP0]]
+//
+int8x4_t test_pmv_s_i8x4(int8_t x) {
+  return __riscv_pmv_s_i8x4(x);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_u16x2(
+// RV32-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_u16x2(
+// RV64-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP0]]
+//
+uint16x2_t test_pmv_s_u16x2(uint16_t x) {
+  return __riscv_pmv_s_u16x2(x);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_i16x2(
+// RV32-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_i16x2(
+// RV64-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP0]]
+//
+int16x2_t test_pmv_s_i16x2(int16_t x) {
+  return __riscv_pmv_s_i16x2(x);
+}
+
+/* Packed Splat (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_u8x8(
+// RV32-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_u8x8(
+// RV64-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+uint8x8_t test_pmv_s_u8x8(uint8_t x) {
+  return __riscv_pmv_s_u8x8(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_i8x8(
+// RV32-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_i8x8(
+// RV64-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+int8x8_t test_pmv_s_i8x8(int8_t x) {
+  return __riscv_pmv_s_i8x8(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_u16x4(
+// RV32-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_u16x4(
+// RV64-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+uint16x4_t test_pmv_s_u16x4(uint16_t x) {
+  return __riscv_pmv_s_u16x4(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_i16x4(
+// RV32-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_i16x4(
+// RV64-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+int16x4_t test_pmv_s_i16x4(int16_t x) {
+  return __riscv_pmv_s_i16x4(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_u32x2(
+// RV32-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_u32x2(
+// RV64-SAME: i32 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+uint32x2_t test_pmv_s_u32x2(uint32_t x) {
+  return __riscv_pmv_s_u32x2(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_i32x2(
+// RV32-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_i32x2(
+// RV64-SAME: i32 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+int32x2_t test_pmv_s_i32x2(int32_t x) {
+  return __riscv_pmv_s_i32x2(x);
+}
+
+/* Packed Addition and Subtraction (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_padd_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_padd_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_padd_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_padd_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psub_i8x4(a, b); // lane-wise a - b; IR: sub <4 x i8>
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psub_u8x4(a, b); // same IR as signed: sub <4 x i8>
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psub_i16x2(a, b); // lane-wise a - b; IR: sub <2 x i16>
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psub_u16x2(a, b); // same IR as signed: sub <2 x i16>
+}
+
+// RV32-LABEL: define dso_local i32 @test_pneg_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pneg_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int8x4_t test_pneg_i8x4(int8x4_t a) {
+  return __riscv_pneg_i8x4(a); // negation checked as 0 - a: sub <4 x i8>
+}
+
+// RV32-LABEL: define dso_local i32 @test_pneg_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pneg_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int16x2_t test_pneg_i16x2(int16x2_t a) {
+  return __riscv_pneg_i16x2(a); // negation checked as 0 - a: sub <2 x i16>
+}
+
+/* Packed Addition and Subtraction (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_padd_i8x8(a, b); // lane-wise add; IR: add <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_padd_u8x8(a, b); // same IR as signed: add <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_padd_i16x4(a, b); // lane-wise add; IR: add <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_padd_u16x4(a, b); // same IR as signed: add <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_padd_i32x2(a, b); // lane-wise add; IR: add <2 x i32>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_padd_u32x2(a, b); // same IR as signed: add <2 x i32>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psub_i8x8(a, b); // lane-wise a - b; IR: sub <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psub_u8x8(a, b); // same IR as signed: sub <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psub_i16x4(a, b); // lane-wise a - b; IR: sub <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psub_u16x4(a, b); // same IR as signed: sub <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psub_i32x2(a, b); // lane-wise a - b; IR: sub <2 x i32>
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psub_u32x2(a, b); // same IR as signed: sub <2 x i32>
+}
+
+// RV32-LABEL: define dso_local i64 @test_pneg_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pneg_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int8x8_t test_pneg_i8x8(int8x8_t a) {
+  return __riscv_pneg_i8x8(a); // negation checked as 0 - a: sub <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_pneg_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pneg_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int16x4_t test_pneg_i16x4(int16x4_t a) {
+  return __riscv_pneg_i16x4(a); // negation checked as 0 - a: sub <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_pneg_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pneg_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_pneg_i32x2(int32x2_t a) {
+  return __riscv_pneg_i32x2(a); // negation checked as 0 - a: sub <2 x i32>
+}
+
+/* Packed Addition with Scalar (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint8x4_t test_padd_s_u8x4(uint8x4_t a, uint8_t b) {
+  return __riscv_padd_s_u8x4(a, b); // splat b (zeroext arg), add <4 x i8>
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int8x4_t test_padd_s_i8x4(int8x4_t a, int8_t b) {
+  return __riscv_padd_s_i8x4(a, b); // splat b (signext arg), add <4 x i8>
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint16x2_t test_padd_s_u16x2(uint16x2_t a, uint16_t b) {
+  return __riscv_padd_s_u16x2(a, b); // splat b (zeroext arg), add <2 x i16>
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int16x2_t test_padd_s_i16x2(int16x2_t a, int16_t b) {
+  return __riscv_padd_s_i16x2(a, b); // splat b (signext arg), add <2 x i16>
+}
+
+/* Packed Addition with Scalar (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint8x8_t test_padd_s_u8x8(uint8x8_t a, uint8_t b) {
+  return __riscv_padd_s_u8x8(a, b); // splat b (zeroext arg), add <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int8x8_t test_padd_s_i8x8(int8x8_t a, int8_t b) {
+  return __riscv_padd_s_i8x8(a, b); // splat b (signext arg), add <8 x i8>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint16x4_t test_padd_s_u16x4(uint16x4_t a, uint16_t b) {
+  return __riscv_padd_s_u16x4(a, b); // splat b (zeroext arg), add <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int16x4_t test_padd_s_i16x4(int16x4_t a, int16_t b) {
+  return __riscv_padd_s_i16x4(a, b); // expects splat(b) + a as <4 x i16>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint32x2_t test_padd_s_u32x2(uint32x2_t a, uint32_t b) {
+  return __riscv_padd_s_u32x2(a, b); // expects splat(b) + a as <2 x i32>
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_padd_s_i32x2(int32x2_t a, int32_t b) {
+  return __riscv_padd_s_i32x2(a, b); // expects splat(b) + a as <2 x i32>
+}
+
+/* Packed Saturating Addition and Subtraction (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_psadd_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psadd_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int8x4_t test_psadd_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psadd_i8x4(a, b); // expects llvm.sadd.sat.v4i8(a, b)
+}
+
+// RV32-LABEL: define dso_local i32 @test_psadd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psadd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_psadd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psadd_i16x2(a, b); // expects llvm.sadd.sat.v2i16(a, b)
+}
+
+// RV32-LABEL: define dso_local i32 @test_psaddu_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psaddu_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint8x4_t test_psaddu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psaddu_u8x4(a, b); // expects llvm.uadd.sat.v4i8(a, b)
+}
+
+// RV32-LABEL: define dso_local i32 @test_psaddu_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psaddu_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_psaddu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psaddu_u16x2(a, b); // expects llvm.uadd.sat.v2i16(a, b)
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssub_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssub_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int8x4_t test_pssub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pssub_i8x4(a, b); // expects llvm.ssub.sat.v4i8(a, b)
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssub_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssub_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_pssub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pssub_i16x2(a, b); // expects llvm.ssub.sat.v2i16(a, b)
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssubu_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssubu_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint8x4_t test_pssubu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pssubu_u8x4(a, b); // expects llvm.usub.sat.v4i8(a, b)
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssubu_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssubu_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_pssubu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pssubu_u16x2(a, b); // expects llvm.usub.sat.v2i16(a, b)
+}
+
+/* Packed Saturating Addition and Subtraction (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_psadd_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psadd_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_psadd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psadd_i8x8(a, b); // expects llvm.sadd.sat.v8i8(a, b)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psadd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psadd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_psadd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psadd_i16x4(a, b); // expects llvm.sadd.sat.v4i16(a, b)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psadd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psadd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_psadd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psadd_i32x2(a, b); // expects llvm.sadd.sat.v2i32(a, b)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psaddu_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psaddu_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_psaddu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psaddu_u8x8(a, b); // expects llvm.uadd.sat.v8i8(a, b)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psaddu_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psaddu_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_psaddu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psaddu_u16x4(a, b); // expects llvm.uadd.sat.v4i16(a, b)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psaddu_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psaddu_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_psaddu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psaddu_u32x2(a, b); // expects llvm.uadd.sat.v2i32(a, b)
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssub_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssub_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_pssub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pssub_i8x8(a, b); // expects llvm.ssub.sat.v8i8(a, b)
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssub_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssub_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_pssub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pssub_i16x4(a, b); // expects llvm.ssub.sat.v4i16(a, b)
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssub_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssub_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_pssub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pssub_i32x2(a, b); // expects llvm.ssub.sat.v2i32(a, b)
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssubu_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssubu_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_pssubu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pssubu_u8x8(a, b); // expects llvm.usub.sat.v8i8(a, b)
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssubu_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssubu_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_pssubu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pssubu_u16x4(a, b); // expects llvm.usub.sat.v4i16(a, b)
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssubu_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssubu_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_pssubu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pssubu_u32x2(a, b); // expects llvm.usub.sat.v2i32(a, b)
+}
+
+/* Packed Shift-Add (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_psh1add_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psh1add_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_psh1add_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psh1add_i16x2(a, b); // expects (a << 1) + b per i16 lane
+}
+
+// RV32-LABEL: define dso_local i32 @test_psh1add_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psh1add_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_psh1add_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psh1add_u16x2(a, b); // expects (a << 1) + b per i16 lane
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssh1sadd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP0]])
+// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ELT_SAT_I]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT3_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssh1sadd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP0]])
+// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ELT_SAT_I]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT3_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_pssh1sadd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pssh1sadd_i16x2(a, b); // expects sadd.sat(sadd.sat(a, a), b)
+}
+
+/* Packed Shift-Add (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_psh1add_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psh1add_i16x4(a, b); // expects (a << 1) + b per i16 lane
+}
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_psh1add_u16x4(uint16x4_t a, uint16x4_t b) { // Codegen test input: wraps __riscv_psh1add_u16x4 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_psh1add_u16x4(a, b); // Checked IR: shl <4 x i16> a by splat 1, then add b (same sequence as the signed i16x4 test).
+}
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_psh1add_i32x2(int32x2_t a, int32x2_t b) { // Codegen test input: wraps __riscv_psh1add_i32x2 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_psh1add_i32x2(a, b); // Checked IR: shl <2 x i32> a by splat 1, then add b; operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_psh1add_u32x2(uint32x2_t a, uint32x2_t b) { // Codegen test input: wraps __riscv_psh1add_u32x2 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_psh1add_u32x2(a, b); // Checked IR: shl <2 x i32> a by splat 1, then add b (same sequence as the signed i32x2 test).
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssh1sadd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP0]])
+// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[ELT_SAT_I]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT3_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssh1sadd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP0]])
+// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[ELT_SAT_I]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT3_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_pssh1sadd_i16x4(int16x4_t a, int16x4_t b) { // Codegen test input: wraps __riscv_pssh1sadd_i16x4 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pssh1sadd_i16x4(a, b); // Checked IR: @llvm.sadd.sat.v4i16(a, a), then @llvm.sadd.sat.v4i16 of that result with b.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssh1sadd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP0]])
+// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ELT_SAT_I]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT3_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssh1sadd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP0]])
+// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ELT_SAT_I]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT3_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_pssh1sadd_i32x2(int32x2_t a, int32x2_t b) { // Codegen test input: wraps __riscv_pssh1sadd_i32x2 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pssh1sadd_i32x2(a, b); // Checked IR: @llvm.sadd.sat.v2i32(a, a), then @llvm.sadd.sat.v2i32 of that result with b.
+}
+
+/* Packed Minimum and Maximum (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_pmin_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmin_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int8x4_t test_pmin_i8x4(int8x4_t a, int8x4_t b) { // Codegen test input: wraps __riscv_pmin_i8x4 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmin_i8x4(a, b); // Checked IR: a single @llvm.smin.v4i8(a, b); operands and result are coerced through i32.
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmin_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmin_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_pmin_i16x2(int16x2_t a, int16x2_t b) { // Codegen test input: wraps __riscv_pmin_i16x2 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmin_i16x2(a, b); // Checked IR: a single @llvm.smin.v2i16(a, b); operands and result are coerced through i32.
+}
+
+// RV32-LABEL: define dso_local i32 @test_pminu_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pminu_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint8x4_t test_pminu_u8x4(uint8x4_t a, uint8x4_t b) { // Codegen test input: wraps __riscv_pminu_u8x4 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pminu_u8x4(a, b); // Checked IR: a single @llvm.umin.v4i8(a, b), i.e. the unsigned counterpart of the smin test.
+}
+
+// RV32-LABEL: define dso_local i32 @test_pminu_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pminu_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_pminu_u16x2(uint16x2_t a, uint16x2_t b) { // Codegen test input: wraps __riscv_pminu_u16x2 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pminu_u16x2(a, b); // Checked IR: a single @llvm.umin.v2i16(a, b); operands and result are coerced through i32.
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmax_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmax_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int8x4_t test_pmax_i8x4(int8x4_t a, int8x4_t b) { // Codegen test input: wraps __riscv_pmax_i8x4 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmax_i8x4(a, b); // Checked IR: a single @llvm.smax.v4i8(a, b); operands and result are coerced through i32.
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmax_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmax_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_pmax_i16x2(int16x2_t a, int16x2_t b) { // Codegen test input: wraps __riscv_pmax_i16x2 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmax_i16x2(a, b); // Checked IR: a single @llvm.smax.v2i16(a, b); operands and result are coerced through i32.
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmaxu_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmaxu_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint8x4_t test_pmaxu_u8x4(uint8x4_t a, uint8x4_t b) { // Codegen test input: wraps __riscv_pmaxu_u8x4 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmaxu_u8x4(a, b); // Checked IR: a single @llvm.umax.v4i8(a, b), i.e. the unsigned counterpart of the smax test.
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmaxu_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmaxu_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_pmaxu_u16x2(uint16x2_t a, uint16x2_t b) { // Codegen test input: wraps __riscv_pmaxu_u16x2 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmaxu_u16x2(a, b); // Checked IR: a single @llvm.umax.v2i16(a, b); operands and result are coerced through i32.
+}
+
+/* Packed Minimum and Maximum (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_pmin_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmin_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_pmin_i8x8(int8x8_t a, int8x8_t b) { // Codegen test input: wraps __riscv_pmin_i8x8 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmin_i8x8(a, b); // Checked IR: a single @llvm.smin.v8i8(a, b); operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmin_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmin_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_pmin_i16x4(int16x4_t a, int16x4_t b) { // Codegen test input: wraps __riscv_pmin_i16x4 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmin_i16x4(a, b); // Checked IR: a single @llvm.smin.v4i16(a, b); operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmin_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmin_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_pmin_i32x2(int32x2_t a, int32x2_t b) { // Codegen test input: wraps __riscv_pmin_i32x2 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmin_i32x2(a, b); // Checked IR: a single @llvm.smin.v2i32(a, b); operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pminu_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pminu_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_pminu_u8x8(uint8x8_t a, uint8x8_t b) { // Codegen test input: wraps __riscv_pminu_u8x8 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pminu_u8x8(a, b); // Checked IR: a single @llvm.umin.v8i8(a, b); operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pminu_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pminu_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_pminu_u16x4(uint16x4_t a, uint16x4_t b) { // Codegen test input: wraps __riscv_pminu_u16x4 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pminu_u16x4(a, b); // Checked IR: a single @llvm.umin.v4i16(a, b); operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pminu_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pminu_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_pminu_u32x2(uint32x2_t a, uint32x2_t b) { // Codegen test input: wraps __riscv_pminu_u32x2 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pminu_u32x2(a, b); // Checked IR: a single @llvm.umin.v2i32(a, b); operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmax_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmax_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_pmax_i8x8(int8x8_t a, int8x8_t b) { // Codegen test input: wraps __riscv_pmax_i8x8 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmax_i8x8(a, b); // Checked IR: a single @llvm.smax.v8i8(a, b); operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmax_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.smax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmax_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.smax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_pmax_i16x4(int16x4_t a, int16x4_t b) { // Codegen test input: wraps __riscv_pmax_i16x4 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmax_i16x4(a, b); // Checked IR: a single @llvm.smax.v4i16(a, b); operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmax_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmax_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_pmax_i32x2(int32x2_t a, int32x2_t b) { // Codegen test input: wraps __riscv_pmax_i32x2 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmax_i32x2(a, b); // Checked IR: a single @llvm.smax.v2i32(a, b); operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_pmaxu_u8x8(uint8x8_t a, uint8x8_t b) { // Codegen test input: wraps __riscv_pmaxu_u8x8 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmaxu_u8x8(a, b); // Checked IR: a single @llvm.umax.v8i8(a, b); operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.umax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.umax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_pmaxu_u16x4(uint16x4_t a, uint16x4_t b) { // Codegen test input: wraps __riscv_pmaxu_u16x4 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmaxu_u16x4(a, b); // Checked IR: a single @llvm.umax.v4i16(a, b); operands and result are coerced through i64.
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.umax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.umax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_pmaxu_u32x2(uint32x2_t a, uint32x2_t b) { // Codegen test input: wraps __riscv_pmaxu_u32x2 so the RV32/RV64 CHECK lines above pin its IR.
+  return __riscv_pmaxu_u32x2(a, b); // Checked IR: a single @llvm.umax.v2i32(a, b); operands and result are coerced through i64.
+}
+
+/* Packed Shifts (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) { // Codegen test input: wraps __riscv_psll_s_i8x4; on RV64 the checks expect shamt to be passed signext.
+  return __riscv_psll_s_i8x4(a, shamt); // Checked IR: shamt is truncated to i8, masked with 7, splatted to <4 x i8>, then used to shl a.
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
+  return __riscv_psll_s_u8x4(a, shamt); // IR: shl by splat(shamt & 7)
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
+  return __riscv_psll_s_i16x2(a, shamt); // IR: shl by splat(shamt & 15)
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
+  return __riscv_psll_s_u16x2(a, shamt); // IR: shl by splat(shamt & 15)
+}
+
+// RV32-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
+  return __riscv_psra_s_i8x4(a, shamt); // IR: ashr by splat(shamt & 7)
+}
+
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
+  return __riscv_psrl_s_u8x4(a, shamt); // IR: lshr by splat(shamt & 7)
+}
+
+// RV32-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
+  return __riscv_psra_s_i16x2(a, shamt); // IR: ashr by splat(shamt & 15)
+}
+
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
+  return __riscv_psrl_s_u16x2(a, shamt); // IR: lshr by splat(shamt & 15)
+}
+
+/* Packed Shifts (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
+  return __riscv_psll_s_i8x8(a, shamt); // IR: shl by splat(shamt & 7)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
+  return __riscv_psll_s_u8x8(a, shamt); // IR: shl by splat(shamt & 7)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
+  return __riscv_psll_s_i16x4(a, shamt); // IR: shl by splat(shamt & 15)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
+  return __riscv_psll_s_u16x4(a, shamt); // IR: shl by splat(shamt & 15)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
+  return __riscv_psll_s_i32x2(a, shamt); // IR: shl by splat(shamt & 31)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) {
+  return __riscv_psll_s_u32x2(a, shamt); // IR: shl by splat(shamt & 31)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
+  return __riscv_psra_s_i8x8(a, shamt); // IR: ashr by splat(shamt & 7)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
+  return __riscv_psrl_s_u8x8(a, shamt); // IR: lshr by splat(shamt & 7)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
+  return __riscv_psra_s_i16x4(a, shamt); // IR: ashr by splat(shamt & 15)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
+  return __riscv_psrl_s_u16x4(a, shamt); // IR: lshr by splat(shamt & 15)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
+  return __riscv_psra_s_i32x2(a, shamt); // IR: ashr by splat(shamt & 31)
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned shamt) {
+  return __riscv_psrl_s_u32x2(a, shamt); // IR: lshr by splat(shamt & 31)
+}
+
+/* Packed Logical Operations (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_pand_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[AND_I1]]
+//
+int8x4_t test_pand_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pand_i8x4(a, b); // IR: one scalar 'and i32' on the args
+}
+
+// RV32-LABEL: define dso_local i32 @test_pand_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[AND_I1]]
+//
+uint8x4_t test_pand_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pand_u8x4(a, b); // IR: one scalar 'and i32' on the args
+}
+
+// RV32-LABEL: define dso_local i32 @test_pand_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[AND_I1]]
+//
+int16x2_t test_pand_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pand_i16x2(a, b); // IR: one scalar 'and i32' on the args
+}
+
+// RV32-LABEL: define dso_local i32 @test_pand_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[AND_I1]]
+//
+uint16x2_t test_pand_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pand_u16x2(a, b); // IR: one scalar 'and i32' on the args
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[OR_I1]]
+//
+int8x4_t test_por_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_por_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[OR_I1]]
+//
+uint8x4_t test_por_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_por_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[OR_I1]]
+//
+int16x2_t test_por_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_por_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[OR_I1]]
+//
+uint16x2_t test_por_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_por_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[XOR_I1]]
+//
+int8x4_t test_pxor_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pxor_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[XOR_I1]]
+//
+uint8x4_t test_pxor_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pxor_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[XOR_I1]]
+//
+int16x2_t test_pxor_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pxor_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[XOR_I1]]
+//
+uint16x2_t test_pxor_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pxor_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int8x4_t test_pnot_i8x4(int8x4_t a) {
+  return __riscv_pnot_i8x4(a);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint8x4_t test_pnot_u8x4(uint8x4_t a) {
+  return __riscv_pnot_u8x4(a);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int16x2_t test_pnot_i16x2(int16x2_t a) {
+  return __riscv_pnot_i16x2(a);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint16x2_t test_pnot_u16x2(uint16x2_t a) {
+  return __riscv_pnot_u16x2(a);
+}
+
+/* Packed Logical Operations (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_pand_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
+//
+int8x8_t test_pand_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pand_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pand_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
+//
+uint8x8_t test_pand_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pand_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pand_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
+//
+int16x4_t test_pand_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pand_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pand_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
+//
+uint16x4_t test_pand_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pand_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pand_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
+//
+int32x2_t test_pand_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pand_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pand_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pand_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
+//
+uint32x2_t test_pand_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pand_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
+//
+int8x8_t test_por_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_por_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
+//
+uint8x8_t test_por_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_por_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
+//
+int16x4_t test_por_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_por_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
+//
+uint16x4_t test_por_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_por_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
+//
+int32x2_t test_por_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_por_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_por_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
+//
+uint32x2_t test_por_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_por_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
+//
+int8x8_t test_pxor_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pxor_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
+//
+uint8x8_t test_pxor_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pxor_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
+//
+int16x4_t test_pxor_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pxor_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
+//
+uint16x4_t test_pxor_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pxor_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
+//
+int32x2_t test_pxor_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pxor_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
+//
+uint32x2_t test_pxor_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pxor_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int8x8_t test_pnot_i8x8(int8x8_t a) {
+  return __riscv_pnot_i8x8(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint8x8_t test_pnot_u8x8(uint8x8_t a) {
+  return __riscv_pnot_u8x8(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int16x4_t test_pnot_i16x4(int16x4_t a) {
+  return __riscv_pnot_i16x4(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint16x4_t test_pnot_u16x4(uint16x4_t a) {
+  return __riscv_pnot_u16x4(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_pnot_i32x2(int32x2_t a) {
+  return __riscv_pnot_i32x2(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pnot_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pnot_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint32x2_t test_pnot_u32x2(uint32x2_t a) {
+  return __riscv_pnot_u32x2(a);
+}

diff  --git a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
new file mode 100644
index 0000000000000..288780e1252c0
--- /dev/null
+++ b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
@@ -0,0 +1,1198 @@
+// REQUIRES: riscv-registered-target
+// expected-no-diagnostics
+
+// RUN: %clang %s -O2 -S -o - --target=riscv32 \
+// RUN:   -menable-experimental-extensions -march=rv32i_p0p21 \
+// RUN:   -Werror -Wextra -Xclang -verify \
+// RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
+// RUN: %clang %s -O2 -S -o - --target=riscv64 \
+// RUN:   -menable-experimental-extensions -march=rv64i_p0p21 \
+// RUN:   -Werror -Wextra -Xclang -verify \
+// RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
+
+#include <riscv_packed_simd.h>
+
+// CHECK-LABEL: test_pmv_s_u8x4:
+// CHECK:       pmv.bs
+uint8x4_t test_pmv_s_u8x4(uint8_t x) { return __riscv_pmv_s_u8x4(x); }
+
+// CHECK-LABEL: test_pmv_s_i8x4:
+// CHECK:       pmv.bs
+int8x4_t test_pmv_s_i8x4(int8_t x) { return __riscv_pmv_s_i8x4(x); }
+
+// CHECK-LABEL: test_pmv_s_u16x2:
+// CHECK:       pmv.hs
+uint16x2_t test_pmv_s_u16x2(uint16_t x) { return __riscv_pmv_s_u16x2(x); }
+
+// CHECK-LABEL: test_pmv_s_i16x2:
+// CHECK:       pmv.hs
+int16x2_t test_pmv_s_i16x2(int16_t x) { return __riscv_pmv_s_i16x2(x); }
+
+// TODO: On RV64, the 32-bit packed constant splat emits `lui`+`addi` instead
+// of `pli.b`/`pli.h` or `plui.h`.
+// CHECK-LABEL: test_pmv_s_u8x4_imm:
+// RV32:        pli.b
+// RV64:        lui
+uint8x4_t test_pmv_s_u8x4_imm(void) { return __riscv_pmv_s_u8x4(5); }
+
+// CHECK-LABEL: test_pmv_s_i8x4_imm:
+// RV32:        pli.b
+// RV64:        lui
+int8x4_t test_pmv_s_i8x4_imm(void) { return __riscv_pmv_s_i8x4(-3); }
+
+// CHECK-LABEL: test_pmv_s_u16x2_imm:
+// RV32:        pli.h
+// RV64:        lui
+uint16x2_t test_pmv_s_u16x2_imm(void) { return __riscv_pmv_s_u16x2(42); }
+
+// CHECK-LABEL: test_pmv_s_i16x2_imm:
+// RV32:        pli.h
+// RV64:        lui
+int16x2_t test_pmv_s_i16x2_imm(void) { return __riscv_pmv_s_i16x2(-5); }
+
+// CHECK-LABEL: test_pmv_s_u16x2_imm_hi:
+// RV32:        plui.h
+// RV64:        lui
+uint16x2_t test_pmv_s_u16x2_imm_hi(void) { return __riscv_pmv_s_u16x2(0x3600); }
+
+// CHECK-LABEL: test_pmv_s_i16x2_imm_hi:
+// RV32:        plui.h
+// RV64:        lui
+int16x2_t test_pmv_s_i16x2_imm_hi(void) { return __riscv_pmv_s_i16x2(0x3600); }
+
+// CHECK-LABEL: test_pmv_s_u8x8:
+// RV32:        pmv.dbs
+// RV64:        pmv.bs
+uint8x8_t test_pmv_s_u8x8(uint8_t x) { return __riscv_pmv_s_u8x8(x); }
+
+// CHECK-LABEL: test_pmv_s_i8x8:
+// RV32:        pmv.dbs
+// RV64:        pmv.bs
+int8x8_t test_pmv_s_i8x8(int8_t x) { return __riscv_pmv_s_i8x8(x); }
+
+// CHECK-LABEL: test_pmv_s_u16x4:
+// RV32:        pmv.dhs
+// RV64:        pmv.hs
+uint16x4_t test_pmv_s_u16x4(uint16_t x) { return __riscv_pmv_s_u16x4(x); }
+
+// CHECK-LABEL: test_pmv_s_i16x4:
+// RV32:        pmv.dhs
+// RV64:        pmv.hs
+int16x4_t test_pmv_s_i16x4(int16_t x) { return __riscv_pmv_s_i16x4(x); }
+
+// TODO: On RV32, the 32x2 variable splat emits a plain `mv` instead of
+// `padd.dws` with rs1_p=x0.
+// CHECK-LABEL: test_pmv_s_u32x2:
+// RV32:        mv{{[[:space:]]}}
+// RV64:        pmv.ws
+uint32x2_t test_pmv_s_u32x2(uint32_t x) { return __riscv_pmv_s_u32x2(x); }
+
+// CHECK-LABEL: test_pmv_s_i32x2:
+// RV32:        mv{{[[:space:]]}}
+// RV64:        pmv.ws
+int32x2_t test_pmv_s_i32x2(int32_t x) { return __riscv_pmv_s_i32x2(x); }
+
+// CHECK-LABEL: test_pmv_s_u8x8_imm:
+// RV32:        pli.db
+// RV64:        pli.b
+uint8x8_t test_pmv_s_u8x8_imm(void) { return __riscv_pmv_s_u8x8(5); }
+
+// CHECK-LABEL: test_pmv_s_i8x8_imm:
+// RV32:        pli.db
+// RV64:        pli.b
+int8x8_t test_pmv_s_i8x8_imm(void) { return __riscv_pmv_s_i8x8(-3); }
+
+// CHECK-LABEL: test_pmv_s_u16x4_imm:
+// RV32:        pli.dh
+// RV64:        pli.h
+uint16x4_t test_pmv_s_u16x4_imm(void) { return __riscv_pmv_s_u16x4(42); }
+
+// CHECK-LABEL: test_pmv_s_i16x4_imm:
+// RV32:        pli.dh
+// RV64:        pli.h
+int16x4_t test_pmv_s_i16x4_imm(void) { return __riscv_pmv_s_i16x4(-5); }
+
+// CHECK-LABEL: test_pmv_s_u16x4_imm_hi:
+// RV32:        plui.dh
+// RV64:        plui.h
+uint16x4_t test_pmv_s_u16x4_imm_hi(void) { return __riscv_pmv_s_u16x4(0x3600); }
+
+// CHECK-LABEL: test_pmv_s_i16x4_imm_hi:
+// RV32:        plui.dh
+// RV64:        plui.h
+int16x4_t test_pmv_s_i16x4_imm_hi(void) { return __riscv_pmv_s_i16x4(0x3600); }
+
+// Note: Constants that fit `addi`'s 12-bit immediate fold to 2x `li`.
+// Larger constants follow `lui`+`addi`+`mv`; see `_imm_big` below.
+// CHECK-LABEL: test_pmv_s_u32x2_imm:
+// RV32-COUNT-2: li{{[[:space:]]}}
+// RV64:         pli.w
+uint32x2_t test_pmv_s_u32x2_imm(void) { return __riscv_pmv_s_u32x2(42); }
+
+// CHECK-LABEL: test_pmv_s_i32x2_imm:
+// RV32-COUNT-2: li{{[[:space:]]}}
+// RV64:         pli.w
+int32x2_t test_pmv_s_i32x2_imm(void) { return __riscv_pmv_s_i32x2(-5); }
+
+// CHECK-LABEL: test_pmv_s_u32x2_imm_big:
+// RV32:        lui
+// RV32-NEXT:   addi
+// RV32-NEXT:   mv{{[[:space:]]}}
+// RV32-NEXT:   ret
+uint32x2_t test_pmv_s_u32x2_imm_big(void) {
+  return __riscv_pmv_s_u32x2(0x12345);
+}
+
+// CHECK-LABEL: test_pmv_s_i32x2_imm_big:
+// RV32:        lui
+// RV32-NEXT:   addi
+// RV32-NEXT:   mv{{[[:space:]]}}
+// RV32-NEXT:   ret
+int32x2_t test_pmv_s_i32x2_imm_big(void) {
+  return __riscv_pmv_s_i32x2(0x12345);
+}
+
+// CHECK-LABEL: test_padd_i8x4:
+// CHECK:       padd.b
+int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_padd_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_u8x4:
+// CHECK:       padd.b
+uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_padd_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_i16x2:
+// CHECK:       padd.h
+int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_padd_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_u16x2:
+// CHECK:       padd.h
+uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_padd_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_i8x4:
+// CHECK:       psub.b
+int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psub_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_u8x4:
+// CHECK:       psub.b
+uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psub_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_i16x2:
+// CHECK:       psub.h
+int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psub_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_u16x2:
+// CHECK:       psub.h
+uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psub_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pneg_i8x4:
+// CHECK:       pneg.b
+int8x4_t test_pneg_i8x4(int8x4_t a) { return __riscv_pneg_i8x4(a); }
+
+// CHECK-LABEL: test_pneg_i16x2:
+// CHECK:       pneg.h
+int16x2_t test_pneg_i16x2(int16x2_t a) { return __riscv_pneg_i16x2(a); }
+
+// CHECK-LABEL: test_padd_i8x8:
+// RV32:        padd.db
+// RV64:        padd.b
+int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_padd_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_u8x8:
+// RV32:        padd.db
+// RV64:        padd.b
+uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_padd_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_i16x4:
+// RV32:        padd.dh
+// RV64:        padd.h
+int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_padd_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_u16x4:
+// RV32:        padd.dh
+// RV64:        padd.h
+uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_padd_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_i32x2:
+// RV32:        padd.dw
+// RV64:        padd.w
+int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_padd_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_u32x2:
+// RV32:        padd.dw
+// RV64:        padd.w
+uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_padd_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_i8x8:
+// RV32:        psub.db
+// RV64:        psub.b
+int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psub_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_psub_u8x8:
+// RV32:        psub.db
+// RV64:        psub.b
+uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psub_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_psub_i16x4:
+// RV32:        psub.dh
+// RV64:        psub.h
+int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psub_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_u16x4:
+// RV32:        psub.dh
+// RV64:        psub.h
+uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psub_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_i32x2:
+// RV32:        psub.dw
+// RV64:        psub.w
+int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psub_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_u32x2:
+// RV32:        psub.dw
+// RV64:        psub.w
+uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psub_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pneg_i8x8:
+// RV32:        pneg.db
+// RV64:        pneg.b
+int8x8_t test_pneg_i8x8(int8x8_t a) { return __riscv_pneg_i8x8(a); }
+
+// CHECK-LABEL: test_pneg_i16x4:
+// RV32:        pneg.dh
+// RV64:        pneg.h
+int16x4_t test_pneg_i16x4(int16x4_t a) { return __riscv_pneg_i16x4(a); }
+
+// CHECK-LABEL: test_pneg_i32x2:
+// RV32:        pneg.dw
+// RV64:        pneg.w
+int32x2_t test_pneg_i32x2(int32x2_t a) { return __riscv_pneg_i32x2(a); }
+
+// CHECK-LABEL: test_padd_s_u8x4:
+// CHECK:       padd.bs
+uint8x4_t test_padd_s_u8x4(uint8x4_t a, uint8_t b) {
+  return __riscv_padd_s_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i8x4:
+// CHECK:       padd.bs
+int8x4_t test_padd_s_i8x4(int8x4_t a, int8_t b) {
+  return __riscv_padd_s_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u16x2:
+// CHECK:       padd.hs
+uint16x2_t test_padd_s_u16x2(uint16x2_t a, uint16_t b) {
+  return __riscv_padd_s_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i16x2:
+// CHECK:       padd.hs
+int16x2_t test_padd_s_i16x2(int16x2_t a, int16_t b) {
+  return __riscv_padd_s_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u8x8:
+// RV32:        padd.dbs
+// RV64:        padd.bs
+uint8x8_t test_padd_s_u8x8(uint8x8_t a, uint8_t b) {
+  return __riscv_padd_s_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i8x8:
+// RV32:        padd.dbs
+// RV64:        padd.bs
+int8x8_t test_padd_s_i8x8(int8x8_t a, int8_t b) {
+  return __riscv_padd_s_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u16x4:
+// RV32:        padd.dhs
+// RV64:        padd.hs
+uint16x4_t test_padd_s_u16x4(uint16x4_t a, uint16_t b) {
+  return __riscv_padd_s_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i16x4:
+// RV32:        padd.dhs
+// RV64:        padd.hs
+int16x4_t test_padd_s_i16x4(int16x4_t a, int16_t b) {
+  return __riscv_padd_s_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u32x2:
+// RV32:        padd.dws
+// RV64:        padd.ws
+uint32x2_t test_padd_s_u32x2(uint32x2_t a, uint32_t b) {
+  return __riscv_padd_s_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i32x2:
+// RV32:        padd.dws
+// RV64:        padd.ws
+int32x2_t test_padd_s_i32x2(int32x2_t a, int32_t b) {
+  return __riscv_padd_s_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i8x4:
+// CHECK:       psadd.b
+int8x4_t test_psadd_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psadd_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i16x2:
+// CHECK:       psadd.h
+int16x2_t test_psadd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psadd_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u8x4:
+// CHECK:       psaddu.b
+uint8x4_t test_psaddu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psaddu_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u16x2:
+// CHECK:       psaddu.h
+uint16x2_t test_psaddu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psaddu_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i8x4:
+// CHECK:       pssub.b
+int8x4_t test_pssub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pssub_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i16x2:
+// CHECK:       pssub.h
+int16x2_t test_pssub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pssub_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u8x4:
+// CHECK:       pssubu.b
+uint8x4_t test_pssubu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pssubu_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u16x2:
+// CHECK:       pssubu.h
+uint16x2_t test_pssubu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pssubu_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i8x8:
+// RV32:        psadd.db
+// RV64:        psadd.b
+int8x8_t test_psadd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psadd_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i16x4:
+// RV32:        psadd.dh
+// RV64:        psadd.h
+int16x4_t test_psadd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psadd_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i32x2:
+// RV32:        psadd.dw
+// RV64:        psadd.w
+int32x2_t test_psadd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psadd_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u8x8:
+// RV32:        psaddu.db
+// RV64:        psaddu.b
+uint8x8_t test_psaddu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psaddu_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u16x4:
+// RV32:        psaddu.dh
+// RV64:        psaddu.h
+uint16x4_t test_psaddu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psaddu_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u32x2:
+// RV32:        psaddu.dw
+// RV64:        psaddu.w
+uint32x2_t test_psaddu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psaddu_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i8x8:
+// RV32:        pssub.db
+// RV64:        pssub.b
+int8x8_t test_pssub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pssub_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i16x4:
+// RV32:        pssub.dh
+// RV64:        pssub.h
+int16x4_t test_pssub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pssub_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i32x2:
+// RV32:        pssub.dw
+// RV64:        pssub.w
+int32x2_t test_pssub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pssub_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u8x8:
+// RV32:        pssubu.db
+// RV64:        pssubu.b
+uint8x8_t test_pssubu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pssubu_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u16x4:
+// RV32:        pssubu.dh
+// RV64:        pssubu.h
+uint16x4_t test_pssubu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pssubu_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u32x2:
+// RV32:        pssubu.dw
+// RV64:        pssubu.w
+uint32x2_t test_pssubu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pssubu_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_psh1add_i16x2:
+// CHECK:       psh1add.h
+int16x2_t test_psh1add_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psh1add_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_psh1add_u16x2:
+// CHECK:       psh1add.h
+uint16x2_t test_psh1add_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psh1add_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pssh1sadd_i16x2:
+// CHECK:       pssh1sadd.h
+int16x2_t test_pssh1sadd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pssh1sadd_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_psh1add_i16x4:
+// RV32:        psh1add.dh
+// RV64:        psh1add.h
+int16x4_t test_psh1add_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psh1add_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_psh1add_u16x4:
+// RV32:        psh1add.dh
+// RV64:        psh1add.h
+uint16x4_t test_psh1add_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psh1add_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_psh1add_i32x2:
+// RV32:        psh1add.dw
+// RV64:        psh1add.w
+int32x2_t test_psh1add_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psh1add_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_psh1add_u32x2:
+// RV32:        psh1add.dw
+// RV64:        psh1add.w
+uint32x2_t test_psh1add_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psh1add_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pssh1sadd_i16x4:
+// RV32:        pssh1sadd.dh
+// RV64:        pssh1sadd.h
+int16x4_t test_pssh1sadd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pssh1sadd_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pssh1sadd_i32x2:
+// RV32:        pssh1sadd.dw
+// RV64:        pssh1sadd.w
+int32x2_t test_pssh1sadd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pssh1sadd_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_pmin_i8x4:
+// CHECK:       pmin.b
+int8x4_t test_pmin_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pmin_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_pmin_i16x2:
+// CHECK:       pmin.h
+int16x2_t test_pmin_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pmin_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_pminu_u8x4:
+// CHECK:       pminu.b
+uint8x4_t test_pminu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pminu_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_pminu_u16x2:
+// CHECK:       pminu.h
+uint16x2_t test_pminu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pminu_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pmax_i8x4:
+// CHECK:       pmax.b
+int8x4_t test_pmax_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pmax_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_pmax_i16x2:
+// CHECK:       pmax.h
+int16x2_t test_pmax_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pmax_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_pmaxu_u8x4:
+// CHECK:       pmaxu.b
+uint8x4_t test_pmaxu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pmaxu_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_pmaxu_u16x2:
+// CHECK:       pmaxu.h
+uint16x2_t test_pmaxu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pmaxu_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pmin_i8x8:
+// RV32:        pmin.db
+// RV64:        pmin.b
+int8x8_t test_pmin_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pmin_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_pmin_i16x4:
+// RV32:        pmin.dh
+// RV64:        pmin.h
+int16x4_t test_pmin_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pmin_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pmin_i32x2:
+// RV32:        pmin.dw
+// RV64:        pmin.w
+int32x2_t test_pmin_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pmin_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_pminu_u8x8:
+// RV32:        pminu.db
+// RV64:        pminu.b
+uint8x8_t test_pminu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pminu_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_pminu_u16x4:
+// RV32:        pminu.dh
+// RV64:        pminu.h
+uint16x4_t test_pminu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pminu_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_pminu_u32x2:
+// RV32:        pminu.dw
+// RV64:        pminu.w
+uint32x2_t test_pminu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pminu_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pmax_i8x8:
+// RV32:        pmax.db
+// RV64:        pmax.b
+int8x8_t test_pmax_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pmax_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_pmax_i16x4:
+// RV32:        pmax.dh
+// RV64:        pmax.h
+int16x4_t test_pmax_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pmax_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pmax_i32x2:
+// RV32:        pmax.dw
+// RV64:        pmax.w
+int32x2_t test_pmax_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pmax_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_pmaxu_u8x8:
+// RV32:        pmaxu.db
+// RV64:        pmaxu.b
+uint8x8_t test_pmaxu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pmaxu_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_pmaxu_u16x4:
+// RV32:        pmaxu.dh
+// RV64:        pmaxu.h
+uint16x4_t test_pmaxu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pmaxu_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_pmaxu_u32x2:
+// RV32:        pmaxu.dw
+// RV64:        pmaxu.w
+uint32x2_t test_pmaxu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pmaxu_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_psll_s_u8x4:
+// CHECK:       psll.bs
+uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned n) {
+  return __riscv_psll_s_u8x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i8x4:
+// CHECK:       psll.bs
+int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned n) {
+  return __riscv_psll_s_i8x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u16x2:
+// CHECK:       psll.hs
+uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned n) {
+  return __riscv_psll_s_u16x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i16x2:
+// CHECK:       psll.hs
+int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned n) {
+  return __riscv_psll_s_i16x2(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x4:
+// CHECK:       psrl.bs
+uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned n) {
+  return __riscv_psrl_s_u8x4(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x2:
+// CHECK:       psrl.hs
+uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned n) {
+  return __riscv_psrl_s_u16x2(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i8x4:
+// CHECK:       psra.bs
+int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned n) {
+  return __riscv_psra_s_i8x4(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i16x2:
+// CHECK:       psra.hs
+int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned n) {
+  return __riscv_psra_s_i16x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u8x4_imm:
+// CHECK:       pslli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x4_t test_psll_s_u8x4_imm(uint8x4_t a) {
+  return __riscv_psll_s_u8x4(a, 2);
+}
+
+// CHECK-LABEL: test_psll_s_i8x4_imm:
+// CHECK:       pslli.b{{[[:space:]]+}}{{.*}}, 3
+int8x4_t test_psll_s_i8x4_imm(int8x4_t a) { return __riscv_psll_s_i8x4(a, 3); }
+
+// CHECK-LABEL: test_psll_s_u16x2_imm:
+// CHECK:       pslli.h{{[[:space:]]+}}{{.*}}, 5
+uint16x2_t test_psll_s_u16x2_imm(uint16x2_t a) {
+  return __riscv_psll_s_u16x2(a, 5);
+}
+
+// CHECK-LABEL: test_psll_s_i16x2_imm:
+// CHECK:       pslli.h{{[[:space:]]+}}{{.*}}, 7
+int16x2_t test_psll_s_i16x2_imm(int16x2_t a) {
+  return __riscv_psll_s_i16x2(a, 7);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x4_imm:
+// CHECK:       psrli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x4_t test_psrl_s_u8x4_imm(uint8x4_t a) {
+  return __riscv_psrl_s_u8x4(a, 2);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x2_imm:
+// CHECK:       psrli.h{{[[:space:]]+}}{{.*}}, 3
+uint16x2_t test_psrl_s_u16x2_imm(uint16x2_t a) {
+  return __riscv_psrl_s_u16x2(a, 3);
+}
+
+// CHECK-LABEL: test_psra_s_i8x4_imm:
+// CHECK:       psrai.b{{[[:space:]]+}}{{.*}}, 4
+int8x4_t test_psra_s_i8x4_imm(int8x4_t a) { return __riscv_psra_s_i8x4(a, 4); }
+
+// CHECK-LABEL: test_psra_s_i16x2_imm:
+// CHECK:       psrai.h{{[[:space:]]+}}{{.*}}, 5
+int16x2_t test_psra_s_i16x2_imm(int16x2_t a) {
+  return __riscv_psra_s_i16x2(a, 5);
+}
+
+// CHECK-LABEL: test_psll_s_u8x8:
+// RV32:        psll.dbs
+// RV64:        psll.bs
+uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned n) {
+  return __riscv_psll_s_u8x8(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i8x8:
+// RV32:        psll.dbs
+// RV64:        psll.bs
+int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned n) {
+  return __riscv_psll_s_i8x8(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u16x4:
+// RV32:        psll.dhs
+// RV64:        psll.hs
+uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned n) {
+  return __riscv_psll_s_u16x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i16x4:
+// RV32:        psll.dhs
+// RV64:        psll.hs
+int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned n) {
+  return __riscv_psll_s_i16x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u32x2:
+// RV32:        psll.dws
+// RV64:        psll.ws
+uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned n) {
+  return __riscv_psll_s_u32x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i32x2:
+// RV32:        psll.dws
+// RV64:        psll.ws
+int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned n) {
+  return __riscv_psll_s_i32x2(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x8:
+// RV32:        psrl.dbs
+// RV64:        psrl.bs
+uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned n) {
+  return __riscv_psrl_s_u8x8(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x4:
+// RV32:        psrl.dhs
+// RV64:        psrl.hs
+uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned n) {
+  return __riscv_psrl_s_u16x4(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u32x2:
+// RV32:        psrl.dws
+// RV64:        psrl.ws
+uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned n) {
+  return __riscv_psrl_s_u32x2(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i8x8:
+// RV32:        psra.dbs
+// RV64:        psra.bs
+int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned n) {
+  return __riscv_psra_s_i8x8(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i16x4:
+// RV32:        psra.dhs
+// RV64:        psra.hs
+int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned n) {
+  return __riscv_psra_s_i16x4(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i32x2:
+// RV32:        psra.dws
+// RV64:        psra.ws
+int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned n) {
+  return __riscv_psra_s_i32x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u8x8_imm:
+// RV32:        pslli.db{{[[:space:]]+}}{{.*}}, 2
+// RV64:        pslli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x8_t test_psll_s_u8x8_imm(uint8x8_t a) {
+  return __riscv_psll_s_u8x8(a, 2);
+}
+
+// CHECK-LABEL: test_psll_s_i8x8_imm:
+// RV32:        pslli.db{{[[:space:]]+}}{{.*}}, 3
+// RV64:        pslli.b{{[[:space:]]+}}{{.*}}, 3
+int8x8_t test_psll_s_i8x8_imm(int8x8_t a) { return __riscv_psll_s_i8x8(a, 3); }
+
+// CHECK-LABEL: test_psll_s_u16x4_imm:
+// RV32:        pslli.dh{{[[:space:]]+}}{{.*}}, 4
+// RV64:        pslli.h{{[[:space:]]+}}{{.*}}, 4
+uint16x4_t test_psll_s_u16x4_imm(uint16x4_t a) {
+  return __riscv_psll_s_u16x4(a, 4);
+}
+
+// CHECK-LABEL: test_psll_s_i16x4_imm:
+// RV32:        pslli.dh{{[[:space:]]+}}{{.*}}, 5
+// RV64:        pslli.h{{[[:space:]]+}}{{.*}}, 5
+int16x4_t test_psll_s_i16x4_imm(int16x4_t a) {
+  return __riscv_psll_s_i16x4(a, 5);
+}
+
+// CHECK-LABEL: test_psll_s_u32x2_imm:
+// RV32:        pslli.dw{{[[:space:]]+}}{{.*}}, 7
+// RV64:        pslli.w{{[[:space:]]+}}{{.*}}, 7
+uint32x2_t test_psll_s_u32x2_imm(uint32x2_t a) {
+  return __riscv_psll_s_u32x2(a, 7);
+}
+
+// CHECK-LABEL: test_psll_s_i32x2_imm:
+// RV32:        pslli.dw{{[[:space:]]+}}{{.*}}, 9
+// RV64:        pslli.w{{[[:space:]]+}}{{.*}}, 9
+int32x2_t test_psll_s_i32x2_imm(int32x2_t a) {
+  return __riscv_psll_s_i32x2(a, 9);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x8_imm:
+// RV32:        psrli.db{{[[:space:]]+}}{{.*}}, 2
+// RV64:        psrli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x8_t test_psrl_s_u8x8_imm(uint8x8_t a) {
+  return __riscv_psrl_s_u8x8(a, 2);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x4_imm:
+// RV32:        psrli.dh{{[[:space:]]+}}{{.*}}, 3
+// RV64:        psrli.h{{[[:space:]]+}}{{.*}}, 3
+uint16x4_t test_psrl_s_u16x4_imm(uint16x4_t a) {
+  return __riscv_psrl_s_u16x4(a, 3);
+}
+
+// CHECK-LABEL: test_psrl_s_u32x2_imm:
+// RV32:        psrli.dw{{[[:space:]]+}}{{.*}}, 5
+// RV64:        psrli.w{{[[:space:]]+}}{{.*}}, 5
+uint32x2_t test_psrl_s_u32x2_imm(uint32x2_t a) {
+  return __riscv_psrl_s_u32x2(a, 5);
+}
+
+// CHECK-LABEL: test_psra_s_i8x8_imm:
+// RV32:        psrai.db{{[[:space:]]+}}{{.*}}, 4
+// RV64:        psrai.b{{[[:space:]]+}}{{.*}}, 4
+int8x8_t test_psra_s_i8x8_imm(int8x8_t a) { return __riscv_psra_s_i8x8(a, 4); }
+
+// CHECK-LABEL: test_psra_s_i16x4_imm:
+// RV32:        psrai.dh{{[[:space:]]+}}{{.*}}, 5
+// RV64:        psrai.h{{[[:space:]]+}}{{.*}}, 5
+int16x4_t test_psra_s_i16x4_imm(int16x4_t a) {
+  return __riscv_psra_s_i16x4(a, 5);
+}
+
+// CHECK-LABEL: test_psra_s_i32x2_imm:
+// RV32:        psrai.dw{{[[:space:]]+}}{{.*}}, 11
+// RV64:        psrai.w{{[[:space:]]+}}{{.*}}, 11
+int32x2_t test_psra_s_i32x2_imm(int32x2_t a) {
+  return __riscv_psra_s_i32x2(a, 11);
+}
+
+// CHECK-LABEL: test_pand_i8x4:
+// CHECK:       and{{[[:space:]]}}
+int8x4_t test_pand_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pand_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_pand_u8x4:
+// CHECK:       and{{[[:space:]]}}
+uint8x4_t test_pand_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pand_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_pand_i16x2:
+// CHECK:       and{{[[:space:]]}}
+int16x2_t test_pand_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pand_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_pand_u16x2:
+// CHECK:       and{{[[:space:]]}}
+uint16x2_t test_pand_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pand_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_por_i8x4:
+// CHECK:       or{{[[:space:]]}}
+int8x4_t test_por_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_por_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_por_u8x4:
+// CHECK:       or{{[[:space:]]}}
+uint8x4_t test_por_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_por_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_por_i16x2:
+// CHECK:       or{{[[:space:]]}}
+int16x2_t test_por_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_por_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_por_u16x2:
+// CHECK:       or{{[[:space:]]}}
+uint16x2_t test_por_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_por_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pxor_i8x4:
+// CHECK:       xor{{[[:space:]]}}
+int8x4_t test_pxor_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pxor_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_pxor_u8x4:
+// CHECK:       xor{{[[:space:]]}}
+uint8x4_t test_pxor_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pxor_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_pxor_i16x2:
+// CHECK:       xor{{[[:space:]]}}
+int16x2_t test_pxor_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pxor_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_pxor_u16x2:
+// CHECK:       xor{{[[:space:]]}}
+uint16x2_t test_pxor_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pxor_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pnot_i8x4:
+// CHECK:       not{{[[:space:]]}}
+int8x4_t test_pnot_i8x4(int8x4_t a) { return __riscv_pnot_i8x4(a); }
+
+// CHECK-LABEL: test_pnot_u8x4:
+// CHECK:       not{{[[:space:]]}}
+uint8x4_t test_pnot_u8x4(uint8x4_t a) { return __riscv_pnot_u8x4(a); }
+
+// CHECK-LABEL: test_pnot_i16x2:
+// CHECK:       not{{[[:space:]]}}
+int16x2_t test_pnot_i16x2(int16x2_t a) { return __riscv_pnot_i16x2(a); }
+
+// CHECK-LABEL: test_pnot_u16x2:
+// CHECK:       not{{[[:space:]]}}
+uint16x2_t test_pnot_u16x2(uint16x2_t a) { return __riscv_pnot_u16x2(a); }
+
+// CHECK-LABEL: test_pand_i8x8:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+int8x8_t test_pand_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pand_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_pand_u8x8:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+uint8x8_t test_pand_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pand_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_pand_i16x4:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+int16x4_t test_pand_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pand_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pand_u16x4:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+uint16x4_t test_pand_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pand_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_pand_i32x2:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+int32x2_t test_pand_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pand_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_pand_u32x2:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+uint32x2_t test_pand_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pand_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_por_i8x8:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+int8x8_t test_por_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_por_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_por_u8x8:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+uint8x8_t test_por_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_por_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_por_i16x4:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+int16x4_t test_por_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_por_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_por_u16x4:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+uint16x4_t test_por_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_por_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_por_i32x2:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+int32x2_t test_por_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_por_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_por_u32x2:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+uint32x2_t test_por_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_por_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pxor_i8x8:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+int8x8_t test_pxor_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pxor_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_pxor_u8x8:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+uint8x8_t test_pxor_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pxor_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_pxor_i16x4:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+int16x4_t test_pxor_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pxor_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pxor_u16x4:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+uint16x4_t test_pxor_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pxor_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_pxor_i32x2:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+int32x2_t test_pxor_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pxor_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_pxor_u32x2:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+uint32x2_t test_pxor_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pxor_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pnot_i8x8:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+int8x8_t test_pnot_i8x8(int8x8_t a) { return __riscv_pnot_i8x8(a); }
+
+// CHECK-LABEL: test_pnot_u8x8:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+uint8x8_t test_pnot_u8x8(uint8x8_t a) { return __riscv_pnot_u8x8(a); }
+
+// CHECK-LABEL: test_pnot_i16x4:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+int16x4_t test_pnot_i16x4(int16x4_t a) { return __riscv_pnot_i16x4(a); }
+
+// CHECK-LABEL: test_pnot_u16x4:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+uint16x4_t test_pnot_u16x4(uint16x4_t a) { return __riscv_pnot_u16x4(a); }
+
+// CHECK-LABEL: test_pnot_i32x2:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+int32x2_t test_pnot_i32x2(int32x2_t a) { return __riscv_pnot_i32x2(a); }
+
+// CHECK-LABEL: test_pnot_u32x2:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+uint32x2_t test_pnot_u32x2(uint32x2_t a) { return __riscv_pnot_u32x2(a); }


        


More information about the cfe-commits mailing list