[clang] [llvm] [RISCV] Add riscv_packed_simd.h for P extension intrinsics (PR #181115)

via cfe-commits cfe-commits at lists.llvm.org
Mon Jun 8 18:01:11 PDT 2026


https://github.com/sihuan updated https://github.com/llvm/llvm-project/pull/181115

>From 846f9ab5d88ecf42b75170b19b938eab059dede4 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Fri, 30 Jan 2026 00:41:13 +0800
Subject: [PATCH 01/19] [Clang][RISCV] Add riscv_simd.h for P extension
 intrinsics

This patch adds `riscv_simd.h`, introducing initial support for RISC-V P extension intrinsics.

The supported operations include:
- Packed addition and subtraction (padd, psub)
- Packed logical and arithmetic shifts (psll, psrl, psra)

These intrinsics are implemented using standard C operators to generate canonical LLVM IR (e.g., `add <4 x i8>`, `shl <2 x i16>`). The implementation relies on the RISC-V backend to correctly lower this IR to specific P extension instructions.
---
 clang/lib/Headers/CMakeLists.txt          |    1 +
 clang/lib/Headers/riscv_simd.h            |  245 +++++
 clang/test/CodeGen/RISCV/rvp-intrinsics.c | 1009 +++++++++++++++++++++
 3 files changed, 1255 insertions(+)
 create mode 100644 clang/lib/Headers/riscv_simd.h
 create mode 100644 clang/test/CodeGen/RISCV/rvp-intrinsics.c

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ce34f8b9410a7..968e6234c0949 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -140,6 +140,7 @@ set(riscv_files
   riscv_corev_alu.h
   riscv_mips.h
   riscv_nds.h
+  riscv_simd.h
   sifive_vector.h
   )
 
diff --git a/clang/lib/Headers/riscv_simd.h b/clang/lib/Headers/riscv_simd.h
new file mode 100644
index 0000000000000..262f35b483cbd
--- /dev/null
+++ b/clang/lib/Headers/riscv_simd.h
@@ -0,0 +1,245 @@
+/*===---- riscv_simd.h - RISC-V P intrinsics -----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_SIMD_H
+#define __RISCV_SIMD_H
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Packed SIMD Types */
+
+typedef int8_t int8x4_t __attribute__((vector_size(4)));   /* 4 x int8_t, 4 bytes */
+typedef uint8_t uint8x4_t __attribute__((vector_size(4))); /* 4 x uint8_t, 4 bytes */
+typedef int16_t int16x2_t __attribute__((vector_size(4))); /* 2 x int16_t, 4 bytes */
+typedef uint16_t uint16x2_t __attribute__((vector_size(4))); /* 2 x uint16_t, 4 bytes */
+
+typedef int8_t int8x8_t __attribute__((vector_size(8)));   /* 8 x int8_t, 8 bytes */
+typedef uint8_t uint8x8_t __attribute__((vector_size(8))); /* 8 x uint8_t, 8 bytes */
+typedef int16_t int16x4_t __attribute__((vector_size(8))); /* 4 x int16_t, 8 bytes */
+typedef uint16_t uint16x4_t __attribute__((vector_size(8))); /* 4 x uint16_t, 8 bytes */
+typedef int32_t int32x2_t __attribute__((vector_size(8))); /* 2 x int32_t, 8 bytes */
+typedef uint32_t uint32x2_t __attribute__((vector_size(8))); /* 2 x uint32_t, 8 bytes */
+
+/* Packed Addition and Subtraction (32-bit) */
+
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_i8x4(int8x4_t __rs1, int8x4_t __rs2) {
+  return __rs1 + __rs2; /* lane-wise add of four signed 8-bit lanes */
+}
+
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_u8x4(uint8x4_t __rs1, uint8x4_t __rs2) {
+  return __rs1 + __rs2; /* lane-wise add of four unsigned 8-bit lanes */
+}
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_i16x2(int16x2_t __rs1, int16x2_t __rs2) {
+  return __rs1 + __rs2; /* lane-wise add of two signed 16-bit lanes */
+}
+
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_u16x2(uint16x2_t __rs1, uint16x2_t __rs2) {
+  return __rs1 + __rs2; /* lane-wise add of two unsigned 16-bit lanes */
+}
+
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_i8x4(int8x4_t __rs1, int8x4_t __rs2) {
+  return __rs1 - __rs2; /* lane-wise __rs1 - __rs2, four signed 8-bit lanes */
+}
+
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_u8x4(uint8x4_t __rs1, uint8x4_t __rs2) {
+  return __rs1 - __rs2; /* lane-wise __rs1 - __rs2, four unsigned 8-bit lanes */
+}
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_i16x2(int16x2_t __rs1, int16x2_t __rs2) {
+  return __rs1 - __rs2; /* lane-wise __rs1 - __rs2, two signed 16-bit lanes */
+}
+
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_u16x2(uint16x2_t __rs1, uint16x2_t __rs2) {
+  return __rs1 - __rs2; /* lane-wise __rs1 - __rs2, two unsigned 16-bit lanes */
+}
+
+/* Packed Addition and Subtraction (64-bit) */
+
+static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_i8x8(int8x8_t __rs1, int8x8_t __rs2) {
+  return __rs1 + __rs2; /* lane-wise add of eight signed 8-bit lanes */
+}
+
+static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_u8x8(uint8x8_t __rs1, uint8x8_t __rs2) {
+  return __rs1 + __rs2; /* lane-wise add of eight unsigned 8-bit lanes */
+}
+
+static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_i16x4(int16x4_t __rs1, int16x4_t __rs2) {
+  return __rs1 + __rs2; /* lane-wise add of four signed 16-bit lanes */
+}
+
+static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_u16x4(uint16x4_t __rs1, uint16x4_t __rs2) {
+  return __rs1 + __rs2; /* lane-wise add of four unsigned 16-bit lanes */
+}
+
+static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_i32x2(int32x2_t __rs1, int32x2_t __rs2) {
+  return __rs1 + __rs2; /* lane-wise add of two signed 32-bit lanes */
+}
+
+static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_u32x2(uint32x2_t __rs1, uint32x2_t __rs2) {
+  return __rs1 + __rs2; /* lane-wise add of two unsigned 32-bit lanes */
+}
+
+static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_i8x8(int8x8_t __rs1, int8x8_t __rs2) {
+  return __rs1 - __rs2; /* lane-wise __rs1 - __rs2, eight signed 8-bit lanes */
+}
+
+static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_u8x8(uint8x8_t __rs1, uint8x8_t __rs2) {
+  return __rs1 - __rs2; /* lane-wise __rs1 - __rs2, eight unsigned 8-bit lanes */
+}
+
+static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_i16x4(int16x4_t __rs1, int16x4_t __rs2) {
+  return __rs1 - __rs2; /* lane-wise __rs1 - __rs2, four signed 16-bit lanes */
+}
+
+static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_u16x4(uint16x4_t __rs1, uint16x4_t __rs2) {
+  return __rs1 - __rs2; /* lane-wise __rs1 - __rs2, four unsigned 16-bit lanes */
+}
+
+static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_i32x2(int32x2_t __rs1, int32x2_t __rs2) {
+  return __rs1 - __rs2; /* lane-wise __rs1 - __rs2, two signed 32-bit lanes */
+}
+
+static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_u32x2(uint32x2_t __rs1, uint32x2_t __rs2) {
+  return __rs1 - __rs2; /* lane-wise __rs1 - __rs2, two unsigned 32-bit lanes */
+}
+
+/* Packed Shifts (32-bit) */
+
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_u8x4(uint8x4_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt; /* all four u8 lanes share the scalar __shamt */
+}
+
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_i8x4(int8x4_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt; /* all four i8 lanes share the scalar __shamt */
+}
+
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_u16x2(uint16x2_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt; /* both u16 lanes share the scalar __shamt */
+}
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_i16x2(int16x2_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt; /* both i16 lanes share the scalar __shamt */
+}
+
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psrl_s_u8x4(uint8x4_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt; /* >> on four unsigned 8-bit lanes, one scalar count */
+}
+
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psrl_s_u16x2(uint16x2_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt; /* >> on two unsigned 16-bit lanes, one scalar count */
+}
+
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psra_s_i8x4(int8x4_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt; /* >> on four signed 8-bit lanes, one scalar count */
+}
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psra_s_i16x2(int16x2_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt; /* >> on two signed 16-bit lanes, one scalar count */
+}
+
+/* Packed Shifts (64-bit) */
+
+static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_u8x8(uint8x8_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt; /* all eight u8 lanes share the scalar __shamt */
+}
+
+static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_i8x8(int8x8_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt; /* all eight i8 lanes share the scalar __shamt */
+}
+
+static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_u16x4(uint16x4_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt; /* all four u16 lanes share the scalar __shamt */
+}
+
+static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_i16x4(int16x4_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt; /* all four i16 lanes share the scalar __shamt */
+}
+
+static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_u32x2(uint32x2_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt; /* both u32 lanes share the scalar __shamt */
+}
+
+static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_i32x2(int32x2_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt; /* both i32 lanes share the scalar __shamt */
+}
+
+static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psrl_s_u8x8(uint8x8_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt; /* >> on eight unsigned 8-bit lanes, one scalar count */
+}
+
+static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psrl_s_u16x4(uint16x4_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt; /* >> on four unsigned 16-bit lanes, one scalar count */
+}
+
+static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psrl_s_u32x2(uint32x2_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt; /* >> on two unsigned 32-bit lanes, one scalar count */
+}
+
+static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psra_s_i8x8(int8x8_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt; /* >> on eight signed 8-bit lanes, one scalar count */
+}
+
+static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psra_s_i16x4(int16x4_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt; /* >> on four signed 16-bit lanes, one scalar count */
+}
+
+static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psra_s_i32x2(int32x2_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt; /* >> on two signed 32-bit lanes, one scalar count */
+}
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __RISCV_SIMD_H */
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
new file mode 100644
index 0000000000000..40a21fa071387
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -0,0 +1,1009 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple riscv32 -target-feature +experimental-p \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=sroa,instcombine | FileCheck %s --check-prefix=RV32
+// RUN: %clang_cc1 -triple riscv64 -target-feature +experimental-p \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=sroa,instcombine | FileCheck %s --check-prefix=RV64
+
+#include <riscv_simd.h>
+
+/* 32-bit Packed Addition and Subtraction */
+
+// RV32-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_padd_i8x4(a, b); /* checks above expect add <4 x i8> */
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_padd_u8x4(a, b); /* checks above expect add <4 x i8> */
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_padd_i16x2(a, b); /* checks above expect add <2 x i16> */
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_padd_u16x2(a, b); /* checks above expect add <2 x i16> */
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psub_i8x4(a, b); /* checks above expect sub <4 x i8> */
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psub_u8x4(a, b); /* checks above expect sub <4 x i8> */
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psub_i16x2(a, b); /* checks above expect sub <2 x i16> */
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psub_u16x2(a, b); /* checks above expect sub <2 x i16> */
+}
+
+/* 64-bit Packed Addition and Subtraction */
+
+// RV32-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_padd_i8x8(a, b); /* checks above expect add <8 x i8> */
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_padd_u8x8(a, b); /* checks above expect add <8 x i8> */
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_padd_i16x4(a, b); /* checks above expect add <4 x i16> */
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_padd_u16x4(a, b); /* checks above expect add <4 x i16> */
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_padd_i32x2(a, b); /* checks above expect add <2 x i32> */
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_padd_u32x2(a, b); /* checks above expect add <2 x i32> */
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psub_i8x8(a, b); /* checks above expect sub <8 x i8> */
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psub_u8x8(a, b); /* checks above expect sub <8 x i8> */
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psub_i16x4(a, b); /* checks above expect sub <4 x i16> */
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psub_u16x4(a, b); /* checks above expect sub <4 x i16> */
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psub_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psub_u32x2(a, b);
+}
+
+/* 32-bit Packed Shifts */
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
+  return __riscv_psll_s_i8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
+  return __riscv_psll_s_u8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
+  return __riscv_psll_s_i16x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
+  return __riscv_psll_s_u16x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
+  return __riscv_psra_s_i8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
+  return __riscv_psrl_s_u8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
+  return __riscv_psra_s_i16x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
+  return __riscv_psrl_s_u16x2(a, shamt);
+}
+
+/* 64-bit Packed Shifts */
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
+  return __riscv_psll_s_i8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
+  return __riscv_psll_s_u8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
+  return __riscv_psll_s_i16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
+  return __riscv_psll_s_u16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
+  return __riscv_psll_s_i32x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) {
+  return __riscv_psll_s_u32x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
+  return __riscv_psra_s_i8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
+  return __riscv_psrl_s_u8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
+  return __riscv_psra_s_i16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
+  return __riscv_psrl_s_u16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
+  return __riscv_psra_s_i32x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned shamt) {
+  return __riscv_psrl_s_u32x2(a, shamt);
+}

>From f7679a66188a463861294d319f230876af2df2d5 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Fri, 13 Feb 2026 10:14:32 +0800
Subject: [PATCH 02/19] Fix list alphabetization and line alignment

---
 clang/lib/Headers/riscv_simd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Headers/riscv_simd.h b/clang/lib/Headers/riscv_simd.h
index 262f35b483cbd..21d4d01628562 100644
--- a/clang/lib/Headers/riscv_simd.h
+++ b/clang/lib/Headers/riscv_simd.h
@@ -1,4 +1,4 @@
-/*===---- riscv_simd.h - RISC-V P intrinsics -----------------===
+/*===---- riscv_simd.h - RISC-V P intrinsics -------------------------------===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.

>From 4b132cabcf5c9e62dc7fa6a4b4830ae1c6c92bb2 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Tue, 24 Feb 2026 14:24:08 +0800
Subject: [PATCH 03/19] Rename P-extension header to riscv_packed.h

---
 clang/lib/Headers/CMakeLists.txt                   | 2 +-
 clang/lib/Headers/{riscv_simd.h => riscv_packed.h} | 8 ++++----
 clang/test/CodeGen/RISCV/rvp-intrinsics.c          | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)
 rename clang/lib/Headers/{riscv_simd.h => riscv_packed.h} (98%)

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 968e6234c0949..59cd039747ae6 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -140,7 +140,7 @@ set(riscv_files
   riscv_corev_alu.h
   riscv_mips.h
   riscv_nds.h
-  riscv_simd.h
+  riscv_packed.h
   sifive_vector.h
   )
 
diff --git a/clang/lib/Headers/riscv_simd.h b/clang/lib/Headers/riscv_packed.h
similarity index 98%
rename from clang/lib/Headers/riscv_simd.h
rename to clang/lib/Headers/riscv_packed.h
index 21d4d01628562..ef2d7b878c2eb 100644
--- a/clang/lib/Headers/riscv_simd.h
+++ b/clang/lib/Headers/riscv_packed.h
@@ -1,4 +1,4 @@
-/*===---- riscv_simd.h - RISC-V P intrinsics -------------------------------===
+/*===---- riscv_packed.h - RISC-V P intrinsics -----------------------------===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -7,8 +7,8 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef __RISCV_SIMD_H
-#define __RISCV_SIMD_H
+#ifndef __RISCV_PACKED_H
+#define __RISCV_PACKED_H
 
 #include <stdint.h>
 
@@ -242,4 +242,4 @@ __riscv_psra_s_i32x2(int32x2_t __rs1, unsigned __shamt) {
 }
 #endif
 
-#endif /* __RISCV_SIMD_H */
+#endif /* __RISCV_PACKED_H */
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index 40a21fa071387..c80a6ad4e95e7 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -6,7 +6,7 @@
 // RUN:   -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=sroa,instcombine | FileCheck %s --check-prefix=RV64
 
-#include <riscv_simd.h>
+#include <riscv_packed.h>
 
 /* 32-bit Packed Addition and Subtraction */
 

>From f01e439ff7ec63e985adb9c23dded6a0bce55909 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Wed, 25 Feb 2026 17:26:03 +0800
Subject: [PATCH 04/19] [RISCV] Refactor P-extension intrinsics with macros

Co-authored-by: Alexander Richardson <alexrichardson at google.com>
---
 clang/lib/Headers/riscv_packed.h | 251 +++++++------------------------
 1 file changed, 52 insertions(+), 199 deletions(-)

diff --git a/clang/lib/Headers/riscv_packed.h b/clang/lib/Headers/riscv_packed.h
index ef2d7b878c2eb..07822f257630e 100644
--- a/clang/lib/Headers/riscv_packed.h
+++ b/clang/lib/Headers/riscv_packed.h
@@ -30,213 +30,66 @@ typedef uint16_t uint16x4_t __attribute__((vector_size(8)));
 typedef int32_t int32x2_t __attribute__((vector_size(8)));
 typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
 
-/* Packed Addition and Subtraction (32-bit) */
-
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_i8x4(int8x4_t __rs1, int8x4_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_u8x4(uint8x4_t __rs1, uint8x4_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_i16x2(int16x2_t __rs1, int16x2_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_u16x2(uint16x2_t __rs1, uint16x2_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_i8x4(int8x4_t __rs1, int8x4_t __rs2) {
-  return __rs1 - __rs2;
-}
+#define _packed_binop(name, retty, ty1, ty2, op)                               \
+  static __inline__ retty __attribute__((__always_inline__, __nodebug__))      \
+  __riscv_##name(ty1 __rs1, ty2 __rs2) {                                       \
+    return __rs1 op __rs2;                                                     \
+  }
 
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_u8x4(uint8x4_t __rs1, uint8x4_t __rs2) {
-  return __rs1 - __rs2;
-}
+#define _packed_addsub(name, ty, op) _packed_binop(name, ty, ty, ty, op)
+#define _packed_shift(name, ty, op) _packed_binop(name, ty, ty, unsigned, op)
 
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_i16x2(int16x2_t __rs1, int16x2_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_u16x2(uint16x2_t __rs1, uint16x2_t __rs2) {
-  return __rs1 - __rs2;
-}
+/* Packed Addition and Subtraction (32-bit) */
+_packed_addsub(padd_i8x4, int8x4_t, +)
+_packed_addsub(padd_u8x4, uint8x4_t, +)
+_packed_addsub(padd_i16x2, int16x2_t, +)
+_packed_addsub(padd_u16x2, uint16x2_t, +)
+_packed_addsub(psub_i8x4, int8x4_t, -)
+_packed_addsub(psub_u8x4, uint8x4_t, -)
+_packed_addsub(psub_i16x2, int16x2_t, -)
+_packed_addsub(psub_u16x2, uint16x2_t, -)
 
 /* Packed Addition and Subtraction (64-bit) */
-
-static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_i8x8(int8x8_t __rs1, int8x8_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_u8x8(uint8x8_t __rs1, uint8x8_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_i16x4(int16x4_t __rs1, int16x4_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_u16x4(uint16x4_t __rs1, uint16x4_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_i32x2(int32x2_t __rs1, int32x2_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_u32x2(uint32x2_t __rs1, uint32x2_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_i8x8(int8x8_t __rs1, int8x8_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_u8x8(uint8x8_t __rs1, uint8x8_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_i16x4(int16x4_t __rs1, int16x4_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_u16x4(uint16x4_t __rs1, uint16x4_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_i32x2(int32x2_t __rs1, int32x2_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_u32x2(uint32x2_t __rs1, uint32x2_t __rs2) {
-  return __rs1 - __rs2;
-}
+_packed_addsub(padd_i8x8, int8x8_t, +)
+_packed_addsub(padd_u8x8, uint8x8_t, +)
+_packed_addsub(padd_i16x4, int16x4_t, +)
+_packed_addsub(padd_u16x4, uint16x4_t, +)
+_packed_addsub(padd_i32x2, int32x2_t, +)
+_packed_addsub(padd_u32x2, uint32x2_t, +)
+_packed_addsub(psub_i8x8, int8x8_t, -)
+_packed_addsub(psub_u8x8, uint8x8_t, -)
+_packed_addsub(psub_i16x4, int16x4_t, -)
+_packed_addsub(psub_u16x4, uint16x4_t, -)
+_packed_addsub(psub_i32x2, int32x2_t, -)
+_packed_addsub(psub_u32x2, uint32x2_t, -)
 
 /* Packed Shifts (32-bit) */
-
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_u8x4(uint8x4_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_i8x4(int8x4_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_u16x2(uint16x2_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_i16x2(int16x2_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psrl_s_u8x4(uint8x4_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psrl_s_u16x2(uint16x2_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psra_s_i8x4(int8x4_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psra_s_i16x2(int16x2_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
+_packed_shift(psll_s_u8x4, uint8x4_t, <<)
+_packed_shift(psll_s_i8x4, int8x4_t, <<)
+_packed_shift(psll_s_u16x2, uint16x2_t, <<)
+_packed_shift(psll_s_i16x2, int16x2_t, <<)
+_packed_shift(psrl_s_u8x4, uint8x4_t, >>)
+_packed_shift(psrl_s_u16x2, uint16x2_t, >>)
+_packed_shift(psra_s_i8x4, int8x4_t, >>)
+_packed_shift(psra_s_i16x2, int16x2_t, >>)
 
 /* Packed Shifts (64-bit) */
-
-static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_u8x8(uint8x8_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_i8x8(int8x8_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_u16x4(uint16x4_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_i16x4(int16x4_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_u32x2(uint32x2_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_i32x2(int32x2_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psrl_s_u8x8(uint8x8_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psrl_s_u16x4(uint16x4_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psrl_s_u32x2(uint32x2_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psra_s_i8x8(int8x8_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psra_s_i16x4(int16x4_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psra_s_i32x2(int32x2_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
+_packed_shift(psll_s_u8x8, uint8x8_t, <<)
+_packed_shift(psll_s_i8x8, int8x8_t, <<)
+_packed_shift(psll_s_u16x4, uint16x4_t, <<)
+_packed_shift(psll_s_i16x4, int16x4_t, <<)
+_packed_shift(psll_s_u32x2, uint32x2_t, <<)
+_packed_shift(psll_s_i32x2, int32x2_t, <<)
+_packed_shift(psrl_s_u8x8, uint8x8_t, >>)
+_packed_shift(psrl_s_u16x4, uint16x4_t, >>)
+_packed_shift(psrl_s_u32x2, uint32x2_t, >>)
+_packed_shift(psra_s_i8x8, int8x8_t, >>)
+_packed_shift(psra_s_i16x4, int16x4_t, >>)
+_packed_shift(psra_s_i32x2, int32x2_t, >>)
+
+#undef _packed_addsub
+#undef _packed_shift
+#undef _packed_binop
 
 #if defined(__cplusplus)
 }

>From ff65675efbb4d6124e6cc9f398e1006f40869741 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Mon, 2 Mar 2026 14:05:56 +0800
Subject: [PATCH 05/19] [RISCV] Standardize P-extension intrinsics macros and
 types

---
 clang/lib/Headers/riscv_packed.h | 112 +++++++++++++++----------------
 1 file changed, 56 insertions(+), 56 deletions(-)

diff --git a/clang/lib/Headers/riscv_packed.h b/clang/lib/Headers/riscv_packed.h
index 07822f257630e..b201c1f1d3f0b 100644
--- a/clang/lib/Headers/riscv_packed.h
+++ b/clang/lib/Headers/riscv_packed.h
@@ -18,78 +18,78 @@ extern "C" {
 
 /* Packed SIMD Types */
 
-typedef int8_t int8x4_t __attribute__((vector_size(4)));
-typedef uint8_t uint8x4_t __attribute__((vector_size(4)));
-typedef int16_t int16x2_t __attribute__((vector_size(4)));
-typedef uint16_t uint16x2_t __attribute__((vector_size(4)));
+typedef int8_t int8x4_t __attribute__((__vector_size__(4), __aligned__(4)));
+typedef uint8_t uint8x4_t __attribute__((__vector_size__(4), __aligned__(4)));
+typedef int16_t int16x2_t __attribute__((__vector_size__(4), __aligned__(4)));
+typedef uint16_t uint16x2_t __attribute__((__vector_size__(4), __aligned__(4)));
 
-typedef int8_t int8x8_t __attribute__((vector_size(8)));
-typedef uint8_t uint8x8_t __attribute__((vector_size(8)));
-typedef int16_t int16x4_t __attribute__((vector_size(8)));
-typedef uint16_t uint16x4_t __attribute__((vector_size(8)));
-typedef int32_t int32x2_t __attribute__((vector_size(8)));
-typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
+typedef int8_t int8x8_t __attribute__((__vector_size__(8), __aligned__(8)));
+typedef uint8_t uint8x8_t __attribute__((__vector_size__(8), __aligned__(8)));
+typedef int16_t int16x4_t __attribute__((__vector_size__(8), __aligned__(8)));
+typedef uint16_t uint16x4_t __attribute__((__vector_size__(8), __aligned__(8)));
+typedef int32_t int32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
+typedef uint32_t uint32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
 
-#define _packed_binop(name, retty, ty1, ty2, op)                               \
+#define __packed_binop(name, retty, ty1, ty2, op)                              \
   static __inline__ retty __attribute__((__always_inline__, __nodebug__))      \
   __riscv_##name(ty1 __rs1, ty2 __rs2) {                                       \
     return __rs1 op __rs2;                                                     \
   }
 
-#define _packed_addsub(name, ty, op) _packed_binop(name, ty, ty, ty, op)
-#define _packed_shift(name, ty, op) _packed_binop(name, ty, ty, unsigned, op)
+#define __packed_addsub(name, ty, op) __packed_binop(name, ty, ty, ty, op)
+#define __packed_shift(name, ty, op) __packed_binop(name, ty, ty, unsigned, op)
 
 /* Packed Addition and Subtraction (32-bit) */
-_packed_addsub(padd_i8x4, int8x4_t, +)
-_packed_addsub(padd_u8x4, uint8x4_t, +)
-_packed_addsub(padd_i16x2, int16x2_t, +)
-_packed_addsub(padd_u16x2, uint16x2_t, +)
-_packed_addsub(psub_i8x4, int8x4_t, -)
-_packed_addsub(psub_u8x4, uint8x4_t, -)
-_packed_addsub(psub_i16x2, int16x2_t, -)
-_packed_addsub(psub_u16x2, uint16x2_t, -)
+__packed_addsub(padd_i8x4, int8x4_t, +)
+__packed_addsub(padd_u8x4, uint8x4_t, +)
+__packed_addsub(padd_i16x2, int16x2_t, +)
+__packed_addsub(padd_u16x2, uint16x2_t, +)
+__packed_addsub(psub_i8x4, int8x4_t, -)
+__packed_addsub(psub_u8x4, uint8x4_t, -)
+__packed_addsub(psub_i16x2, int16x2_t, -)
+__packed_addsub(psub_u16x2, uint16x2_t, -)
 
 /* Packed Addition and Subtraction (64-bit) */
-_packed_addsub(padd_i8x8, int8x8_t, +)
-_packed_addsub(padd_u8x8, uint8x8_t, +)
-_packed_addsub(padd_i16x4, int16x4_t, +)
-_packed_addsub(padd_u16x4, uint16x4_t, +)
-_packed_addsub(padd_i32x2, int32x2_t, +)
-_packed_addsub(padd_u32x2, uint32x2_t, +)
-_packed_addsub(psub_i8x8, int8x8_t, -)
-_packed_addsub(psub_u8x8, uint8x8_t, -)
-_packed_addsub(psub_i16x4, int16x4_t, -)
-_packed_addsub(psub_u16x4, uint16x4_t, -)
-_packed_addsub(psub_i32x2, int32x2_t, -)
-_packed_addsub(psub_u32x2, uint32x2_t, -)
+__packed_addsub(padd_i8x8, int8x8_t, +)
+__packed_addsub(padd_u8x8, uint8x8_t, +)
+__packed_addsub(padd_i16x4, int16x4_t, +)
+__packed_addsub(padd_u16x4, uint16x4_t, +)
+__packed_addsub(padd_i32x2, int32x2_t, +)
+__packed_addsub(padd_u32x2, uint32x2_t, +)
+__packed_addsub(psub_i8x8, int8x8_t, -)
+__packed_addsub(psub_u8x8, uint8x8_t, -)
+__packed_addsub(psub_i16x4, int16x4_t, -)
+__packed_addsub(psub_u16x4, uint16x4_t, -)
+__packed_addsub(psub_i32x2, int32x2_t, -)
+__packed_addsub(psub_u32x2, uint32x2_t, -)
 
 /* Packed Shifts (32-bit) */
-_packed_shift(psll_s_u8x4, uint8x4_t, <<)
-_packed_shift(psll_s_i8x4, int8x4_t, <<)
-_packed_shift(psll_s_u16x2, uint16x2_t, <<)
-_packed_shift(psll_s_i16x2, int16x2_t, <<)
-_packed_shift(psrl_s_u8x4, uint8x4_t, >>)
-_packed_shift(psrl_s_u16x2, uint16x2_t, >>)
-_packed_shift(psra_s_i8x4, int8x4_t, >>)
-_packed_shift(psra_s_i16x2, int16x2_t, >>)
+__packed_shift(psll_s_u8x4, uint8x4_t, <<)
+__packed_shift(psll_s_i8x4, int8x4_t, <<)
+__packed_shift(psll_s_u16x2, uint16x2_t, <<)
+__packed_shift(psll_s_i16x2, int16x2_t, <<)
+__packed_shift(psrl_s_u8x4, uint8x4_t, >>)
+__packed_shift(psrl_s_u16x2, uint16x2_t, >>)
+__packed_shift(psra_s_i8x4, int8x4_t, >>)
+__packed_shift(psra_s_i16x2, int16x2_t, >>)
 
 /* Packed Shifts (64-bit) */
-_packed_shift(psll_s_u8x8, uint8x8_t, <<)
-_packed_shift(psll_s_i8x8, int8x8_t, <<)
-_packed_shift(psll_s_u16x4, uint16x4_t, <<)
-_packed_shift(psll_s_i16x4, int16x4_t, <<)
-_packed_shift(psll_s_u32x2, uint32x2_t, <<)
-_packed_shift(psll_s_i32x2, int32x2_t, <<)
-_packed_shift(psrl_s_u8x8, uint8x8_t, >>)
-_packed_shift(psrl_s_u16x4, uint16x4_t, >>)
-_packed_shift(psrl_s_u32x2, uint32x2_t, >>)
-_packed_shift(psra_s_i8x8, int8x8_t, >>)
-_packed_shift(psra_s_i16x4, int16x4_t, >>)
-_packed_shift(psra_s_i32x2, int32x2_t, >>)
+__packed_shift(psll_s_u8x8, uint8x8_t, <<)
+__packed_shift(psll_s_i8x8, int8x8_t, <<)
+__packed_shift(psll_s_u16x4, uint16x4_t, <<)
+__packed_shift(psll_s_i16x4, int16x4_t, <<)
+__packed_shift(psll_s_u32x2, uint32x2_t, <<)
+__packed_shift(psll_s_i32x2, int32x2_t, <<)
+__packed_shift(psrl_s_u8x8, uint8x8_t, >>)
+__packed_shift(psrl_s_u16x4, uint16x4_t, >>)
+__packed_shift(psrl_s_u32x2, uint32x2_t, >>)
+__packed_shift(psra_s_i8x8, int8x8_t, >>)
+__packed_shift(psra_s_i16x4, int16x4_t, >>)
+__packed_shift(psra_s_i32x2, int32x2_t, >>)
 
-#undef _packed_addsub
-#undef _packed_shift
-#undef _packed_binop
+#undef __packed_addsub
+#undef __packed_shift
+#undef __packed_binop
 
 #if defined(__cplusplus)
 }

>From 8a0e30da6dc00aa1affdf65866632c1d316a63d6 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Thu, 12 Mar 2026 19:44:17 +0800
Subject: [PATCH 06/19] [Clang][RISCV] Mask shift amounts in P extension
 intrinsics to avoid UB

---
 clang/lib/Headers/riscv_packed.h          |  60 +++--
 clang/test/CodeGen/RISCV/rvp-intrinsics.c | 312 ++++++++++++----------
 2 files changed, 210 insertions(+), 162 deletions(-)

diff --git a/clang/lib/Headers/riscv_packed.h b/clang/lib/Headers/riscv_packed.h
index b201c1f1d3f0b..50095bef7ddb3 100644
--- a/clang/lib/Headers/riscv_packed.h
+++ b/clang/lib/Headers/riscv_packed.h
@@ -30,14 +30,20 @@ typedef uint16_t uint16x4_t __attribute__((__vector_size__(8), __aligned__(8)));
 typedef int32_t int32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
 typedef uint32_t uint32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
 
-#define __packed_binop(name, retty, ty1, ty2, op)                              \
-  static __inline__ retty __attribute__((__always_inline__, __nodebug__))      \
-  __riscv_##name(ty1 __rs1, ty2 __rs2) {                                       \
+#define __packed_addsub(name, ty, op)                                          \
+  static __inline__ ty __attribute__((__always_inline__, __nodebug__))         \
+  __riscv_##name(ty __rs1, ty __rs2) {                                         \
     return __rs1 op __rs2;                                                     \
   }
 
-#define __packed_addsub(name, ty, op) __packed_binop(name, ty, ty, ty, op)
-#define __packed_shift(name, ty, op) __packed_binop(name, ty, ty, unsigned, op)
+#define __packed_shift(name, ty, op, mask)                                     \
+  static __inline__ ty __attribute__((__always_inline__, __nodebug__))         \
+  __riscv_##name(ty __rs1, unsigned __rs2) {                                   \
+    return __rs1 op (__rs2 & (mask));                                          \
+  }
+#define __packed_shift8(name, ty, op) __packed_shift(name, ty, op, 0x7)
+#define __packed_shift16(name, ty, op) __packed_shift(name, ty, op, 0xf)
+#define __packed_shift32(name, ty, op) __packed_shift(name, ty, op, 0x1f)
 
 /* Packed Addition and Subtraction (32-bit) */
 __packed_addsub(padd_i8x4, int8x4_t, +)
@@ -64,32 +70,34 @@ __packed_addsub(psub_i32x2, int32x2_t, -)
 __packed_addsub(psub_u32x2, uint32x2_t, -)
 
 /* Packed Shifts (32-bit) */
-__packed_shift(psll_s_u8x4, uint8x4_t, <<)
-__packed_shift(psll_s_i8x4, int8x4_t, <<)
-__packed_shift(psll_s_u16x2, uint16x2_t, <<)
-__packed_shift(psll_s_i16x2, int16x2_t, <<)
-__packed_shift(psrl_s_u8x4, uint8x4_t, >>)
-__packed_shift(psrl_s_u16x2, uint16x2_t, >>)
-__packed_shift(psra_s_i8x4, int8x4_t, >>)
-__packed_shift(psra_s_i16x2, int16x2_t, >>)
+__packed_shift8(psll_s_u8x4, uint8x4_t, <<)
+__packed_shift8(psll_s_i8x4, int8x4_t, <<)
+__packed_shift16(psll_s_u16x2, uint16x2_t, <<)
+__packed_shift16(psll_s_i16x2, int16x2_t, <<)
+__packed_shift8(psrl_s_u8x4, uint8x4_t, >>)
+__packed_shift16(psrl_s_u16x2, uint16x2_t, >>)
+__packed_shift8(psra_s_i8x4, int8x4_t, >>)
+__packed_shift16(psra_s_i16x2, int16x2_t, >>)
 
 /* Packed Shifts (64-bit) */
-__packed_shift(psll_s_u8x8, uint8x8_t, <<)
-__packed_shift(psll_s_i8x8, int8x8_t, <<)
-__packed_shift(psll_s_u16x4, uint16x4_t, <<)
-__packed_shift(psll_s_i16x4, int16x4_t, <<)
-__packed_shift(psll_s_u32x2, uint32x2_t, <<)
-__packed_shift(psll_s_i32x2, int32x2_t, <<)
-__packed_shift(psrl_s_u8x8, uint8x8_t, >>)
-__packed_shift(psrl_s_u16x4, uint16x4_t, >>)
-__packed_shift(psrl_s_u32x2, uint32x2_t, >>)
-__packed_shift(psra_s_i8x8, int8x8_t, >>)
-__packed_shift(psra_s_i16x4, int16x4_t, >>)
-__packed_shift(psra_s_i32x2, int32x2_t, >>)
+__packed_shift8(psll_s_u8x8, uint8x8_t, <<)
+__packed_shift8(psll_s_i8x8, int8x8_t, <<)
+__packed_shift16(psll_s_u16x4, uint16x4_t, <<)
+__packed_shift16(psll_s_i16x4, int16x4_t, <<)
+__packed_shift32(psll_s_u32x2, uint32x2_t, <<)
+__packed_shift32(psll_s_i32x2, int32x2_t, <<)
+__packed_shift8(psrl_s_u8x8, uint8x8_t, >>)
+__packed_shift16(psrl_s_u16x4, uint16x4_t, >>)
+__packed_shift32(psrl_s_u32x2, uint32x2_t, >>)
+__packed_shift8(psra_s_i8x8, int8x8_t, >>)
+__packed_shift16(psra_s_i16x4, int16x4_t, >>)
+__packed_shift32(psra_s_i32x2, int32x2_t, >>)
 
 #undef __packed_addsub
 #undef __packed_shift
-#undef __packed_binop
+#undef __packed_shift8
+#undef __packed_shift16
+#undef __packed_shift32
 
 #if defined(__cplusplus)
 }
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index c80a6ad4e95e7..1c2899684ca39 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -483,11 +483,12 @@ uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_i8x4(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -495,11 +496,12 @@ uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
@@ -511,11 +513,12 @@ int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_u8x4(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -523,11 +526,12 @@ int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
@@ -539,11 +543,12 @@ uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_i16x2(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -551,11 +556,12 @@ uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
@@ -567,11 +573,12 @@ int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_u16x2(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -579,11 +586,12 @@ int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
@@ -595,11 +603,12 @@ uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psra_s_i8x4(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -607,11 +616,12 @@ uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
@@ -623,11 +633,12 @@ int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psrl_s_u8x4(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -635,11 +646,12 @@ int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
@@ -651,11 +663,12 @@ uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psra_s_i16x2(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -663,11 +676,12 @@ uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
@@ -679,11 +693,12 @@ int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psrl_s_u16x2(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -691,11 +706,12 @@ int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
@@ -709,22 +725,24 @@ uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
   return __riscv_psll_s_i8x8(a, shamt);
@@ -735,22 +753,24 @@ int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
   return __riscv_psll_s_u8x8(a, shamt);
@@ -761,22 +781,24 @@ uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
   return __riscv_psll_s_i16x4(a, shamt);
@@ -787,22 +809,24 @@ int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
   return __riscv_psll_s_u16x4(a, shamt);
@@ -812,7 +836,8 @@ uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
@@ -822,7 +847,8 @@ uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
@@ -836,7 +862,8 @@ int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
@@ -846,7 +873,8 @@ int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
@@ -861,22 +889,24 @@ uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
   return __riscv_psra_s_i8x8(a, shamt);
@@ -887,22 +917,24 @@ int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
   return __riscv_psrl_s_u8x8(a, shamt);
@@ -913,22 +945,24 @@ uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
   return __riscv_psra_s_i16x4(a, shamt);
@@ -939,22 +973,24 @@ int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
   return __riscv_psrl_s_u16x4(a, shamt);
@@ -964,7 +1000,8 @@ uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
@@ -974,7 +1011,8 @@ uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
@@ -988,7 +1026,8 @@ int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
@@ -998,7 +1037,8 @@ int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64

>From c1de06b23fee896e06e4edf027f3cfe8336a4cf7 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Wed, 1 Apr 2026 09:33:42 +0000
Subject: [PATCH 07/19] rebase: sync with main

---
 clang/test/CodeGen/RISCV/rvp-intrinsics.c | 184 +++++++++-------------
 1 file changed, 72 insertions(+), 112 deletions(-)

diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index 1c2899684ca39..4f22e51216dd4 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -19,17 +19,14 @@
 // RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV64-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
 // RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
 int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
   return __riscv_padd_i8x4(a, b);
@@ -44,17 +41,14 @@ int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
 // RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
 // RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
 uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
   return __riscv_padd_u8x4(a, b);
@@ -69,17 +63,14 @@ uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
 // RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
 // RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
 int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
   return __riscv_padd_i16x2(a, b);
@@ -94,17 +85,14 @@ int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
 // RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
 // RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
 uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
   return __riscv_padd_u16x2(a, b);
@@ -119,17 +107,14 @@ uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
 // RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
 // RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
 int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
   return __riscv_psub_i8x4(a, b);
@@ -144,17 +129,14 @@ int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
 // RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
 // RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
 uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
   return __riscv_psub_u8x4(a, b);
@@ -169,17 +151,14 @@ uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
 // RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
 // RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
 int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
   return __riscv_psub_i16x2(a, b);
@@ -194,17 +173,14 @@ int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
 // RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
 // RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
 uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
   return __riscv_psub_u16x2(a, b);
@@ -490,19 +466,17 @@ uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
 // RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
 // RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
 // RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
 // RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
 // RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
 int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
   return __riscv_psll_s_i8x4(a, shamt);
@@ -520,19 +494,17 @@ int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
 // RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
 // RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
 // RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
 // RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
 // RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
 uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
   return __riscv_psll_s_u8x4(a, shamt);
@@ -550,19 +522,17 @@ uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
 // RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
 // RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
 // RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
 // RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
 // RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
 int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
   return __riscv_psll_s_i16x2(a, shamt);
@@ -580,19 +550,17 @@ int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
 // RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
 // RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
 // RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
 // RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
 // RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
 uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
   return __riscv_psll_s_u16x2(a, shamt);
@@ -610,19 +578,17 @@ uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
 // RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
 // RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
 // RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
 // RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
 // RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
 int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
   return __riscv_psra_s_i8x4(a, shamt);
@@ -640,19 +606,17 @@ int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
 // RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
 // RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
 // RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
 // RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
 // RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
 uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
   return __riscv_psrl_s_u8x4(a, shamt);
@@ -670,19 +634,17 @@ uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
 // RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
 // RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
 // RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
 // RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
 // RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
 int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
   return __riscv_psra_s_i16x2(a, shamt);
@@ -700,19 +662,17 @@ int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
 // RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
 // RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
 // RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
 // RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
 // RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
 uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
   return __riscv_psrl_s_u16x2(a, shamt);

>From f2b32787160f35f39a6d8955f03f9e06a4c6e164 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Fri, 3 Apr 2026 01:58:33 +0000
Subject: [PATCH 08/19] [RISCV] add more P intrinsics support

Add the following intrinsics: pmv_s, padd_s, pneg, pand/por/pxor/pnot,
and pmin/pmax.
TODO: add codegen support for padd_s.
---
 clang/lib/Headers/riscv_packed.h          |  199 +-
 clang/test/CodeGen/RISCV/rvp-intrinsics.c | 2948 ++++++++++++++++-----
 2 files changed, 2483 insertions(+), 664 deletions(-)

diff --git a/clang/lib/Headers/riscv_packed.h b/clang/lib/Headers/riscv_packed.h
index 50095bef7ddb3..c7605de340faa 100644
--- a/clang/lib/Headers/riscv_packed.h
+++ b/clang/lib/Headers/riscv_packed.h
@@ -30,14 +30,19 @@ typedef uint16_t uint16x4_t __attribute__((__vector_size__(8), __aligned__(8)));
 typedef int32_t int32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
 typedef uint32_t uint32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
 
-#define __packed_addsub(name, ty, op)                                          \
-  static __inline__ ty __attribute__((__always_inline__, __nodebug__))         \
-  __riscv_##name(ty __rs1, ty __rs2) {                                         \
-    return __rs1 op __rs2;                                                     \
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+#define __packed_splat2(ty, x) ((ty){(x), (x)})
+#define __packed_splat4(ty, x) ((ty){(x), (x), (x), (x)})
+#define __packed_splat8(ty, x) ((ty){(x), (x), (x), (x), (x), (x), (x), (x)})
+
+#define __packed_splat(name, ty, scalar_ty, splat)                             \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(scalar_ty __x) {      \
+    return splat(ty, __x);                                                     \
   }
 
 #define __packed_shift(name, ty, op, mask)                                     \
-  static __inline__ ty __attribute__((__always_inline__, __nodebug__))         \
+  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
   __riscv_##name(ty __rs1, unsigned __rs2) {                                   \
     return __rs1 op (__rs2 & (mask));                                          \
   }
@@ -45,29 +50,42 @@ typedef uint32_t uint32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
 #define __packed_shift16(name, ty, op) __packed_shift(name, ty, op, 0xf)
 #define __packed_shift32(name, ty, op) __packed_shift(name, ty, op, 0x1f)
 
-/* Packed Addition and Subtraction (32-bit) */
-__packed_addsub(padd_i8x4, int8x4_t, +)
-__packed_addsub(padd_u8x4, uint8x4_t, +)
-__packed_addsub(padd_i16x2, int16x2_t, +)
-__packed_addsub(padd_u16x2, uint16x2_t, +)
-__packed_addsub(psub_i8x4, int8x4_t, -)
-__packed_addsub(psub_u8x4, uint8x4_t, -)
-__packed_addsub(psub_i16x2, int16x2_t, -)
-__packed_addsub(psub_u16x2, uint16x2_t, -)
+#define __packed_scalar_binary_op(name, ty, scalar_ty, op, splat)              \
+  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
+  __riscv_##name(ty __rs1, scalar_ty __rs2) {                                  \
+    return __rs1 op splat(ty, __rs2);                                          \
+  }
 
-/* Packed Addition and Subtraction (64-bit) */
-__packed_addsub(padd_i8x8, int8x8_t, +)
-__packed_addsub(padd_u8x8, uint8x8_t, +)
-__packed_addsub(padd_i16x4, int16x4_t, +)
-__packed_addsub(padd_u16x4, uint16x4_t, +)
-__packed_addsub(padd_i32x2, int32x2_t, +)
-__packed_addsub(padd_u32x2, uint32x2_t, +)
-__packed_addsub(psub_i8x8, int8x8_t, -)
-__packed_addsub(psub_u8x8, uint8x8_t, -)
-__packed_addsub(psub_i16x4, int16x4_t, -)
-__packed_addsub(psub_u16x4, uint16x4_t, -)
-__packed_addsub(psub_i32x2, int32x2_t, -)
-__packed_addsub(psub_u32x2, uint32x2_t, -)
+#define __packed_binary_op(name, ty, op)                                       \
+  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
+  __riscv_##name(ty __rs1, ty __rs2) {                                         \
+    return __rs1 op __rs2;                                                     \
+  }
+
+#define __packed_unary_op(name, ty, op)                                        \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1) {           \
+    return op __rs1;                                                           \
+  }
+
+#define __packed_minmax(name, ty, builtin)                                     \
+  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
+  __riscv_##name(ty __rs1, ty __rs2) {                                         \
+    return builtin(__rs1, __rs2);                                              \
+  }
+
+/* Packed Splat (32-bit) */
+__packed_splat(pmv_s_u8x4, uint8x4_t, uint8_t, __packed_splat4)
+__packed_splat(pmv_s_i8x4, int8x4_t, int8_t, __packed_splat4)
+__packed_splat(pmv_s_u16x2, uint16x2_t, uint16_t, __packed_splat2)
+__packed_splat(pmv_s_i16x2, int16x2_t, int16_t, __packed_splat2)
+
+/* Packed Splat (64-bit) */
+__packed_splat(pmv_s_u8x8, uint8x8_t, uint8_t, __packed_splat8)
+__packed_splat(pmv_s_i8x8, int8x8_t, int8_t, __packed_splat8)
+__packed_splat(pmv_s_u16x4, uint16x4_t, uint16_t, __packed_splat4)
+__packed_splat(pmv_s_i16x4, int16x4_t, int16_t, __packed_splat4)
+__packed_splat(pmv_s_u32x2, uint32x2_t, uint32_t, __packed_splat2)
+__packed_splat(pmv_s_i32x2, int32x2_t, int32_t, __packed_splat2)
 
 /* Packed Shifts (32-bit) */
 __packed_shift8(psll_s_u8x4, uint8x4_t, <<)
@@ -93,11 +111,136 @@ __packed_shift8(psra_s_i8x8, int8x8_t, >>)
 __packed_shift16(psra_s_i16x4, int16x4_t, >>)
 __packed_shift32(psra_s_i32x2, int32x2_t, >>)
 
-#undef __packed_addsub
+/* Packed Addition with Scalar (32-bit) */
+__packed_scalar_binary_op(padd_s_u8x4, uint8x4_t, uint8_t, +, __packed_splat4)
+__packed_scalar_binary_op(padd_s_i8x4, int8x4_t, int8_t, +, __packed_splat4)
+__packed_scalar_binary_op(padd_s_u16x2, uint16x2_t, uint16_t, +,
+                          __packed_splat2)
+__packed_scalar_binary_op(padd_s_i16x2, int16x2_t, int16_t, +,
+                          __packed_splat2)
+
+/* Packed Addition with Scalar (64-bit) */
+__packed_scalar_binary_op(padd_s_u8x8, uint8x8_t, uint8_t, +, __packed_splat8)
+__packed_scalar_binary_op(padd_s_i8x8, int8x8_t, int8_t, +, __packed_splat8)
+__packed_scalar_binary_op(padd_s_u16x4, uint16x4_t, uint16_t, +,
+                          __packed_splat4)
+__packed_scalar_binary_op(padd_s_i16x4, int16x4_t, int16_t, +,
+                          __packed_splat4)
+__packed_scalar_binary_op(padd_s_u32x2, uint32x2_t, uint32_t, +,
+                          __packed_splat2)
+__packed_scalar_binary_op(padd_s_i32x2, int32x2_t, int32_t, +,
+                          __packed_splat2)
+
+/* Packed Addition, Subtraction and Negation (32-bit) */
+__packed_binary_op(padd_i8x4, int8x4_t, +)
+__packed_binary_op(padd_u8x4, uint8x4_t, +)
+__packed_binary_op(padd_i16x2, int16x2_t, +)
+__packed_binary_op(padd_u16x2, uint16x2_t, +)
+__packed_binary_op(psub_i8x4, int8x4_t, -)
+__packed_binary_op(psub_u8x4, uint8x4_t, -)
+__packed_binary_op(psub_i16x2, int16x2_t, -)
+__packed_binary_op(psub_u16x2, uint16x2_t, -)
+__packed_unary_op(pneg_i8x4, int8x4_t, -)
+__packed_unary_op(pneg_i16x2, int16x2_t, -)
+
+/* Packed Addition, Subtraction and Negation (64-bit) */
+__packed_binary_op(padd_i8x8, int8x8_t, +)
+__packed_binary_op(padd_u8x8, uint8x8_t, +)
+__packed_binary_op(padd_i16x4, int16x4_t, +)
+__packed_binary_op(padd_u16x4, uint16x4_t, +)
+__packed_binary_op(padd_i32x2, int32x2_t, +)
+__packed_binary_op(padd_u32x2, uint32x2_t, +)
+__packed_binary_op(psub_i8x8, int8x8_t, -)
+__packed_binary_op(psub_u8x8, uint8x8_t, -)
+__packed_binary_op(psub_i16x4, int16x4_t, -)
+__packed_binary_op(psub_u16x4, uint16x4_t, -)
+__packed_binary_op(psub_i32x2, int32x2_t, -)
+__packed_binary_op(psub_u32x2, uint32x2_t, -)
+__packed_unary_op(pneg_i8x8, int8x8_t, -)
+__packed_unary_op(pneg_i16x4, int16x4_t, -)
+__packed_unary_op(pneg_i32x2, int32x2_t, -)
+
+/* Packed Minimum and Maximum (32-bit) */
+__packed_minmax(pmin_i8x4, int8x4_t, __builtin_elementwise_min)
+__packed_minmax(pmin_i16x2, int16x2_t, __builtin_elementwise_min)
+__packed_minmax(pminu_u8x4, uint8x4_t, __builtin_elementwise_min)
+__packed_minmax(pminu_u16x2, uint16x2_t, __builtin_elementwise_min)
+__packed_minmax(pmax_i8x4, int8x4_t, __builtin_elementwise_max)
+__packed_minmax(pmax_i16x2, int16x2_t, __builtin_elementwise_max)
+__packed_minmax(pmaxu_u8x4, uint8x4_t, __builtin_elementwise_max)
+__packed_minmax(pmaxu_u16x2, uint16x2_t, __builtin_elementwise_max)
+
+/* Packed Minimum and Maximum (64-bit) */
+__packed_minmax(pmin_i8x8, int8x8_t, __builtin_elementwise_min)
+__packed_minmax(pmin_i16x4, int16x4_t, __builtin_elementwise_min)
+__packed_minmax(pmin_i32x2, int32x2_t, __builtin_elementwise_min)
+__packed_minmax(pminu_u8x8, uint8x8_t, __builtin_elementwise_min)
+__packed_minmax(pminu_u16x4, uint16x4_t, __builtin_elementwise_min)
+__packed_minmax(pminu_u32x2, uint32x2_t, __builtin_elementwise_min)
+__packed_minmax(pmax_i8x8, int8x8_t, __builtin_elementwise_max)
+__packed_minmax(pmax_i16x4, int16x4_t, __builtin_elementwise_max)
+__packed_minmax(pmax_i32x2, int32x2_t, __builtin_elementwise_max)
+__packed_minmax(pmaxu_u8x8, uint8x8_t, __builtin_elementwise_max)
+__packed_minmax(pmaxu_u16x4, uint16x4_t, __builtin_elementwise_max)
+__packed_minmax(pmaxu_u32x2, uint32x2_t, __builtin_elementwise_max)
+
+/* Packed Logical Operations (32-bit) */
+__packed_binary_op(pand_i8x4, int8x4_t, &)
+__packed_binary_op(pand_u8x4, uint8x4_t, &)
+__packed_binary_op(pand_i16x2, int16x2_t, &)
+__packed_binary_op(pand_u16x2, uint16x2_t, &)
+__packed_binary_op(por_i8x4, int8x4_t, |)
+__packed_binary_op(por_u8x4, uint8x4_t, |)
+__packed_binary_op(por_i16x2, int16x2_t, |)
+__packed_binary_op(por_u16x2, uint16x2_t, |)
+__packed_binary_op(pxor_i8x4, int8x4_t, ^)
+__packed_binary_op(pxor_u8x4, uint8x4_t, ^)
+__packed_binary_op(pxor_i16x2, int16x2_t, ^)
+__packed_binary_op(pxor_u16x2, uint16x2_t, ^)
+__packed_unary_op(pnot_i8x4, int8x4_t, ~)
+__packed_unary_op(pnot_u8x4, uint8x4_t, ~)
+__packed_unary_op(pnot_i16x2, int16x2_t, ~)
+__packed_unary_op(pnot_u16x2, uint16x2_t, ~)
+
+/* Packed Logical Operations (64-bit) */
+__packed_binary_op(pand_i8x8, int8x8_t, &)
+__packed_binary_op(pand_u8x8, uint8x8_t, &)
+__packed_binary_op(pand_i16x4, int16x4_t, &)
+__packed_binary_op(pand_u16x4, uint16x4_t, &)
+__packed_binary_op(pand_i32x2, int32x2_t, &)
+__packed_binary_op(pand_u32x2, uint32x2_t, &)
+__packed_binary_op(por_i8x8, int8x8_t, |)
+__packed_binary_op(por_u8x8, uint8x8_t, |)
+__packed_binary_op(por_i16x4, int16x4_t, |)
+__packed_binary_op(por_u16x4, uint16x4_t, |)
+__packed_binary_op(por_i32x2, int32x2_t, |)
+__packed_binary_op(por_u32x2, uint32x2_t, |)
+__packed_binary_op(pxor_i8x8, int8x8_t, ^)
+__packed_binary_op(pxor_u8x8, uint8x8_t, ^)
+__packed_binary_op(pxor_i16x4, int16x4_t, ^)
+__packed_binary_op(pxor_u16x4, uint16x4_t, ^)
+__packed_binary_op(pxor_i32x2, int32x2_t, ^)
+__packed_binary_op(pxor_u32x2, uint32x2_t, ^)
+__packed_unary_op(pnot_i8x8, int8x8_t, ~)
+__packed_unary_op(pnot_u8x8, uint8x8_t, ~)
+__packed_unary_op(pnot_i16x4, int16x4_t, ~)
+__packed_unary_op(pnot_u16x4, uint16x4_t, ~)
+__packed_unary_op(pnot_i32x2, int32x2_t, ~)
+__packed_unary_op(pnot_u32x2, uint32x2_t, ~)
+
+#undef __packed_splat2
+#undef __packed_splat4
+#undef __packed_splat8
+#undef __packed_splat
 #undef __packed_shift
 #undef __packed_shift8
 #undef __packed_shift16
 #undef __packed_shift32
+#undef __packed_scalar_binary_op
+#undef __packed_binary_op
+#undef __packed_unary_op
+#undef __packed_minmax
+#undef __DEFAULT_FN_ATTRS
 
 #if defined(__cplusplus)
 }
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index 4f22e51216dd4..e79c98dfd93a5 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -8,1002 +8,2678 @@
 
 #include <riscv_packed.h>
 
-/* 32-bit Packed Addition and Subtraction */
+/* Packed Splat (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_u8x4(
+// RV32-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_u8x4(
+// RV64-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP0]]
+//
+uint8x4_t test_pmv_s_u8x4(uint8_t x) {
+  return __riscv_pmv_s_u8x4(x);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_i8x4(
+// RV32-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_i8x4(
+// RV64-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i8> [[VECINIT3_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP0]]
+//
+int8x4_t test_pmv_s_i8x4(int8_t x) {
+  return __riscv_pmv_s_i8x4(x);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_u16x2(
+// RV32-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_u16x2(
+// RV64-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP0]]
+//
+uint16x2_t test_pmv_s_u16x2(uint16_t x) {
+  return __riscv_pmv_s_u16x2(x);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmv_s_i16x2(
+// RV32-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmv_s_i16x2(
+// RV64-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i16> [[VECINIT1_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP0]]
+//
+int16x2_t test_pmv_s_i16x2(int16_t x) {
+  return __riscv_pmv_s_i16x2(x);
+}
+
+/* Packed Splat (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_u8x8(
+// RV32-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_u8x8(
+// RV64-SAME: i8 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+uint8x8_t test_pmv_s_u8x8(uint8_t x) {
+  return __riscv_pmv_s_u8x8(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_i8x8(
+// RV32-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_i8x8(
+// RV64-SAME: i8 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[VECINIT7_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+int8x8_t test_pmv_s_i8x8(int8_t x) {
+  return __riscv_pmv_s_i8x8(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_u16x4(
+// RV32-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_u16x4(
+// RV64-SAME: i16 noundef zeroext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+uint16x4_t test_pmv_s_u16x4(uint16_t x) {
+  return __riscv_pmv_s_u16x4(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_i16x4(
+// RV32-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_i16x4(
+// RV64-SAME: i16 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+int16x4_t test_pmv_s_i16x4(int16_t x) {
+  return __riscv_pmv_s_i16x4(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_u32x2(
+// RV32-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_u32x2(
+// RV64-SAME: i32 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+uint32x2_t test_pmv_s_u32x2(uint32_t x) {
+  return __riscv_pmv_s_u32x2(x);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmv_s_i32x2(
+// RV32-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV32-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP0]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmv_s_i32x2(
+// RV64-SAME: i32 noundef signext [[X:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i64 0
+// RV64-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP0]]
+//
+int32x2_t test_pmv_s_i32x2(int32_t x) {
+  return __riscv_pmv_s_i32x2(x);
+}
+
+/* Packed Shifts (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
+  return __riscv_psll_s_i8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
+  return __riscv_psll_s_u8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
+  return __riscv_psll_s_i16x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
+  return __riscv_psll_s_u16x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
+  return __riscv_psra_s_i8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
+  return __riscv_psrl_s_u8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
+  return __riscv_psra_s_i16x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
+//
+uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
+  return __riscv_psrl_s_u16x2(a, shamt);
+}
+
+/* Packed Shifts (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
+  return __riscv_psll_s_i8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
+  // Checked IR (unsigned lanes): shamt is truncated to i8, masked with 7,
+  // splatted to all 8 lanes, and fed to a single <8 x i8> shl.
+  return __riscv_psll_s_u8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
+  // Checked IR: shamt is truncated to i16, masked with 15, splatted to all
+  // 4 lanes, and used as the shift amount of a single <4 x i16> shl.
+  return __riscv_psll_s_i16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
+  // Checked IR (unsigned lanes): shamt is truncated to i16, masked with 15,
+  // splatted to all 4 lanes, and fed to a single <4 x i16> shl.
+  return __riscv_psll_s_u16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
+  // Checked IR: shamt is masked with 31 (no truncation at this lane width),
+  // splatted to both lanes, and fed to a single <2 x i32> shl.
+  return __riscv_psll_s_i32x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) {
+  // Checked IR (unsigned lanes): shamt is masked with 31, splatted to both
+  // lanes, and fed to a single <2 x i32> shl.
+  return __riscv_psll_s_u32x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
+  // Checked IR: shamt is truncated to i8, masked with 7, splatted to all
+  // 8 lanes, and fed to a single <8 x i8> ashr (arithmetic right shift).
+  return __riscv_psra_s_i8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
+  // Checked IR: shamt is truncated to i8, masked with 7, splatted to all
+  // 8 lanes, and fed to a single <8 x i8> lshr (logical right shift).
+  return __riscv_psrl_s_u8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
+  // Checked IR: shamt is truncated to i16, masked with 15, splatted to all
+  // 4 lanes, and fed to a single <4 x i16> ashr (arithmetic right shift).
+  return __riscv_psra_s_i16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
+//
+uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
+  // Checked IR: shamt is truncated to i16, masked with 15, splatted to all
+  // 4 lanes, and fed to a single <4 x i16> lshr (logical right shift).
+  return __riscv_psrl_s_u16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
+  // Checked IR: shamt is masked with 31, splatted to both lanes, and fed to
+  // a single <2 x i32> ashr (arithmetic right shift).
+  return __riscv_psra_s_i32x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned shamt) {
+  // Checked IR: shamt is masked with 31, splatted to both lanes, and fed to
+  // a single <2 x i32> lshr (logical right shift).
+  return __riscv_psrl_s_u32x2(a, shamt);
+}
+
+/* Packed Addition with Scalar (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint8x4_t test_padd_s_u8x4(uint8x4_t a, uint8_t b) {
+  // Checked IR: the zeroext scalar b is splatted to all 4 lanes and combined
+  // with a (bitcast from i32) by a single <4 x i8> add.
+  return __riscv_padd_s_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int8x4_t test_padd_s_i8x4(int8x4_t a, int8_t b) {
+  // Checked IR: the signext scalar b is splatted to all 4 lanes and combined
+  // with a (bitcast from i32) by a single <4 x i8> add.
+  return __riscv_padd_s_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint16x2_t test_padd_s_u16x2(uint16x2_t a, uint16_t b) {
+  // Checked IR: the zeroext scalar b is splatted to both lanes and combined
+  // with a (bitcast from i32) by a single <2 x i16> add.
+  return __riscv_padd_s_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i16> [[VECINIT_I]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int16x2_t test_padd_s_i16x2(int16x2_t a, int16_t b) {
+  // Checked IR: the signext scalar b is splatted to both lanes and combined
+  // with a (bitcast from i32) by a single <2 x i16> add.
+  return __riscv_padd_s_i16x2(a, b);
+}
+
+/* Packed Addition with Scalar (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint8x8_t test_padd_s_u8x8(uint8x8_t a, uint8_t b) {
+  // Checked IR: the zeroext scalar b is splatted to all 8 lanes and combined
+  // with a (bitcast from i64) by a single <8 x i8> add.
+  return __riscv_padd_s_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[VECINIT8_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int8x8_t test_padd_s_i8x8(int8x8_t a, int8_t b) {
+  // Checked IR: the signext scalar b is splatted to all 8 lanes and combined
+  // with a (bitcast from i64) by a single <8 x i8> add.
+  return __riscv_padd_s_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint16x4_t test_padd_s_u16x4(uint16x4_t a, uint16_t b) {
+  // Checked IR: the zeroext scalar b is splatted to all 4 lanes and combined
+  // with a (bitcast from i64) by a single <4 x i16> add.
+  return __riscv_padd_s_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int16x4_t test_padd_s_i16x4(int16x4_t a, int16_t b) {
+  // Checked IR: the signext scalar b is splatted to all 4 lanes and combined
+  // with a (bitcast from i64) by a single <4 x i16> add.
+  return __riscv_padd_s_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint32x2_t test_padd_s_u32x2(uint32x2_t a, uint32_t b) {
+  // Checked IR: the i32 scalar b (marked signext only in the RV64 checks) is
+  // splatted to both lanes and combined with a by a single <2 x i32> add.
+  return __riscv_padd_s_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT2_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[VECINIT2_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_padd_s_i32x2(int32x2_t a, int32_t b) {
+  // Checked IR: the i32 scalar b (marked signext only in the RV64 checks) is
+  // splatted to both lanes and combined with a by a single <2 x i32> add.
+  return __riscv_padd_s_i32x2(a, b);
+}
+
+/* Packed Addition and Subtraction (32-bit) */
 
 // RV32-LABEL: define dso_local i32 @test_padd_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
+  // Checked IR: both i32-coerced arguments are bitcast to <4 x i8>, combined
+  // with a single vector add, and the sum is bitcast back to i32.
+  return __riscv_padd_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
+  // Checked IR (unsigned lanes): both i32-coerced arguments are bitcast to
+  // <4 x i8>, added with one vector add, and the sum is bitcast back to i32.
+  return __riscv_padd_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_padd_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_padd_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psub_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psub_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psub_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psub_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pneg_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pneg_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int8x4_t test_pneg_i8x4(int8x4_t a) {
+  return __riscv_pneg_i8x4(a);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pneg_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pneg_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int16x2_t test_pneg_i16x2(int16x2_t a) {
+  return __riscv_pneg_i16x2(a);
+}
+
+/* Packed Addition and Subtraction (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_padd_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_padd_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_padd_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_padd_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_padd_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_padd_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psub_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_padd_i8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV64-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
-  return __riscv_padd_i8x4(a, b);
+uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psub_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV32-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psub_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psub_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psub_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psub_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pneg_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pneg_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int8x8_t test_pneg_i8x8(int8x8_t a) {
+  return __riscv_pneg_i8x8(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pneg_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pneg_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int16x4_t test_pneg_i16x4(int16x4_t a) {
+  return __riscv_pneg_i16x4(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pneg_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pneg_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_pneg_i32x2(int32x2_t a) {
+  return __riscv_pneg_i32x2(a);
+}
+
+/* Packed Minimum and Maximum (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_pmin_i8x4(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV64-LABEL: define dso_local i32 @test_pmin_i8x4(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
-  return __riscv_padd_u8x4(a, b);
+int8x4_t test_pmin_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pmin_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV32-LABEL: define dso_local i32 @test_pmin_i16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV64-LABEL: define dso_local i32 @test_pmin_i16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
-  return __riscv_padd_i16x2(a, b);
+int16x2_t test_pmin_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pmin_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV32-LABEL: define dso_local i32 @test_pminu_u8x4(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV64-LABEL: define dso_local i32 @test_pminu_u8x4(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
-  return __riscv_padd_u16x2(a, b);
+uint8x4_t test_pminu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pminu_u8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV32-LABEL: define dso_local i32 @test_pminu_u16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV64-LABEL: define dso_local i32 @test_pminu_u16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
-  return __riscv_psub_i8x4(a, b);
+uint16x2_t test_pminu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pminu_u16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV32-LABEL: define dso_local i32 @test_pmax_i8x4(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV64-LABEL: define dso_local i32 @test_pmax_i8x4(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
-  return __riscv_psub_u8x4(a, b);
+int8x4_t test_pmax_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pmax_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV32-LABEL: define dso_local i32 @test_pmax_i16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV64-LABEL: define dso_local i32 @test_pmax_i16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
-  return __riscv_psub_i16x2(a, b);
+int16x2_t test_pmax_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pmax_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV32-LABEL: define dso_local i32 @test_pmaxu_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pmaxu_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint8x4_t test_pmaxu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pmaxu_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pmaxu_u16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV64-LABEL: define dso_local i32 @test_pmaxu_u16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
-  return __riscv_psub_u16x2(a, b);
+uint16x2_t test_pmaxu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pmaxu_u16x2(a, b);
+}
+
+/* Packed Minimum and Maximum (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_pmin_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmin_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_pmin_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pmin_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmin_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmin_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_pmin_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pmin_i16x4(a, b);
 }
 
-/* 64-bit Packed Addition and Subtraction */
-
-// RV32-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV32-LABEL: define dso_local i64 @test_pmin_i32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV64-LABEL: define dso_local i64 @test_pmin_i32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
-  return __riscv_padd_i8x8(a, b);
+int32x2_t test_pmin_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pmin_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV32-LABEL: define dso_local i64 @test_pminu_u8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV64-LABEL: define dso_local i64 @test_pminu_u8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
-  return __riscv_padd_u8x8(a, b);
+uint8x8_t test_pminu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pminu_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV32-LABEL: define dso_local i64 @test_pminu_u16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV64-LABEL: define dso_local i64 @test_pminu_u16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
-  return __riscv_padd_i16x4(a, b);
+uint16x4_t test_pminu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pminu_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV32-LABEL: define dso_local i64 @test_pminu_u32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV64-LABEL: define dso_local i64 @test_pminu_u32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
-  return __riscv_padd_u16x4(a, b);
+uint32x2_t test_pminu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pminu_u32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV32-LABEL: define dso_local i64 @test_pmax_i8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV64-LABEL: define dso_local i64 @test_pmax_i8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
-  return __riscv_padd_i32x2(a, b);
+int8x8_t test_pmax_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pmax_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV32-LABEL: define dso_local i64 @test_pmax_i16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.smax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV64-LABEL: define dso_local i64 @test_pmax_i16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.smax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
-  return __riscv_padd_u32x2(a, b);
+int16x4_t test_pmax_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pmax_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV32-LABEL: define dso_local i64 @test_pmax_i32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV64-LABEL: define dso_local i64 @test_pmax_i32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
-  return __riscv_psub_i8x8(a, b);
+int32x2_t test_pmax_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pmax_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
-  return __riscv_psub_u8x8(a, b);
+uint8x8_t test_pmaxu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pmaxu_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.umax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.umax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_pmaxu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pmaxu_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.umax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.umax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
-  return __riscv_psub_i16x4(a, b);
+uint32x2_t test_pmaxu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pmaxu_u32x2(a, b);
+}
+
+/* Packed Logical Operations (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_pand_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[AND_I1]]
+//
+int8x4_t test_pand_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pand_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pand_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[AND_I1]]
+//
+uint8x4_t test_pand_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pand_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pand_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[AND_I1]]
+//
+int16x2_t test_pand_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pand_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pand_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[AND_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pand_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[AND_I1]]
+//
+uint16x2_t test_pand_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pand_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[OR_I1]]
+//
+int8x4_t test_por_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_por_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[OR_I1]]
+//
+uint8x4_t test_por_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_por_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[OR_I1]]
+//
+int16x2_t test_por_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_por_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_por_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_por_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[OR_I1]]
+//
+uint16x2_t test_por_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_por_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[XOR_I1]]
+//
+int8x4_t test_pxor_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pxor_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[XOR_I1]]
+//
+uint8x4_t test_pxor_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pxor_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[XOR_I1]]
+//
+int16x2_t test_pxor_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pxor_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pxor_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i32 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pxor_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i32 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i32 [[XOR_I1]]
+//
+uint16x2_t test_pxor_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pxor_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int8x4_t test_pnot_i8x4(int8x4_t a) {
+  return __riscv_pnot_i8x4(a);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <4 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[NOT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint8x4_t test_pnot_u8x4(uint8x4_t a) {
+  return __riscv_pnot_u8x4(a);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int16x2_t test_pnot_i16x2(int16x2_t a) {
+  return __riscv_pnot_i16x2(a);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pnot_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pnot_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <2 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[NOT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint16x2_t test_pnot_u16x2(uint16x2_t a) {
+  return __riscv_pnot_u16x2(a);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_u16x4(
+/* Packed Logical Operations (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_pand_i8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV64-LABEL: define dso_local i64 @test_pand_i8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
 //
-uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
-  return __riscv_psub_u16x4(a, b);
+int8x8_t test_pand_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pand_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV32-LABEL: define dso_local i64 @test_pand_u8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV64-LABEL: define dso_local i64 @test_pand_u8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
 //
-int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
-  return __riscv_psub_i32x2(a, b);
+uint8x8_t test_pand_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pand_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV32-LABEL: define dso_local i64 @test_pand_i16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV64-LABEL: define dso_local i64 @test_pand_i16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
 //
-uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
-  return __riscv_psub_u32x2(a, b);
+int16x4_t test_pand_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pand_i16x4(a, b);
 }
 
-/* 32-bit Packed Shifts */
-
-// RV32-LABEL: define dso_local i32 @test_psll_s_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pand_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
 //
-// RV64-LABEL: define dso_local i32 @test_psll_s_i8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pand_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
 //
-int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
-  return __riscv_psll_s_i8x4(a, shamt);
+uint16x4_t test_pand_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pand_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psll_s_u8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pand_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
 //
-// RV64-LABEL: define dso_local i32 @test_psll_s_u8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pand_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
 //
-uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
-  return __riscv_psll_s_u8x4(a, shamt);
+int32x2_t test_pand_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pand_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psll_s_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pand_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[AND_I1]]
 //
-// RV64-LABEL: define dso_local i32 @test_psll_s_i16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pand_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[AND_I1:%.*]] = and i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[AND_I1]]
 //
-int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
-  return __riscv_psll_s_i16x2(a, shamt);
+uint32x2_t test_pand_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pand_u32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psll_s_u16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_por_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
 //
-// RV64-LABEL: define dso_local i32 @test_psll_s_u16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_por_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
 //
-uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
-  return __riscv_psll_s_u16x2(a, shamt);
+int8x8_t test_por_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_por_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psra_s_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_por_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
 //
-// RV64-LABEL: define dso_local i32 @test_psra_s_i8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_por_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
 //
-int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
-  return __riscv_psra_s_i8x4(a, shamt);
+uint8x8_t test_por_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_por_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psrl_s_u8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_por_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
 //
-// RV64-LABEL: define dso_local i32 @test_psrl_s_u8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_por_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
 //
-uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
-  return __riscv_psrl_s_u8x4(a, shamt);
+int16x4_t test_por_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_por_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psra_s_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_por_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
 //
-// RV64-LABEL: define dso_local i32 @test_psra_s_i16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_por_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
 //
-int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
-  return __riscv_psra_s_i16x2(a, shamt);
+uint16x4_t test_por_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_por_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psrl_s_u16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_por_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
 //
-// RV64-LABEL: define dso_local i32 @test_psrl_s_u16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_por_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
 //
-uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
-  return __riscv_psrl_s_u16x2(a, shamt);
+int32x2_t test_por_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_por_i32x2(a, b);
 }
 
-/* 64-bit Packed Shifts */
-
-// RV32-LABEL: define dso_local i64 @test_psll_s_i8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_por_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[OR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_por_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[OR_I1:%.*]] = or i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[OR_I1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+uint32x2_t test_por_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_por_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pxor_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pxor_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
 //
-int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
-  return __riscv_psll_s_i8x8(a, shamt);
+int8x8_t test_pxor_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pxor_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_u8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pxor_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pxor_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
 //
-uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
-  return __riscv_psll_s_u8x8(a, shamt);
+uint8x8_t test_pxor_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pxor_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pxor_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pxor_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
 //
-int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
-  return __riscv_psll_s_i16x4(a, shamt);
+int16x4_t test_pxor_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pxor_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_u16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pxor_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pxor_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
 //
-uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
-  return __riscv_psll_s_u16x4(a, shamt);
+uint16x4_t test_pxor_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pxor_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pxor_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pxor_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
 //
-int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
-  return __riscv_psll_s_i32x2(a, shamt);
+int32x2_t test_pxor_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pxor_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_u32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pxor_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV32-NEXT:    ret i64 [[XOR_I1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pxor_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[XOR_I1:%.*]] = xor i64 [[A_COERCE]], [[B_COERCE]]
+// RV64-NEXT:    ret i64 [[XOR_I1]]
 //
-uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) {
-  return __riscv_psll_s_u32x2(a, shamt);
+uint32x2_t test_pxor_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pxor_u32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psra_s_i8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pnot_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pnot_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
-int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
-  return __riscv_psra_s_i8x8(a, shamt);
+int8x8_t test_pnot_i8x8(int8x8_t a) {
+  return __riscv_pnot_i8x8(a);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psrl_s_u8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pnot_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pnot_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <8 x i8> [[TMP0]], splat (i8 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[NOT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
-uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
-  return __riscv_psrl_s_u8x8(a, shamt);
+uint8x8_t test_pnot_u8x8(uint8x8_t a) {
+  return __riscv_pnot_u8x8(a);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psra_s_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pnot_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pnot_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
-int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
-  return __riscv_psra_s_i16x4(a, shamt);
+int16x4_t test_pnot_i16x4(int16x4_t a) {
+  return __riscv_pnot_i16x4(a);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psrl_s_u16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pnot_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pnot_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <4 x i16> [[TMP0]], splat (i16 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[NOT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
-uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
-  return __riscv_psrl_s_u16x4(a, shamt);
+uint16x4_t test_pnot_u16x4(uint16x4_t a) {
+  return __riscv_pnot_u16x4(a);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psra_s_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pnot_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pnot_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP1]]
 //
-int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
-  return __riscv_psra_s_i32x2(a, shamt);
+int32x2_t test_pnot_i32x2(int32x2_t a) {
+  return __riscv_pnot_i32x2(a);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psrl_s_u32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pnot_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pnot_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    [[NOT_I:%.*]] = xor <2 x i32> [[TMP0]], splat (i32 -1)
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[NOT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP1]]
 //
-uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned shamt) {
-  return __riscv_psrl_s_u32x2(a, shamt);
+uint32x2_t test_pnot_u32x2(uint32x2_t a) {
+  return __riscv_pnot_u32x2(a);
 }

>From f8ccdff499e2d3bbb4470cfda7ea83b0e2710bb0 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Wed, 20 May 2026 10:00:04 +0800
Subject: [PATCH 09/19] [cross-project-tests][RISCV][P-ext] Add riscv_packed.h
 intrinsic test

---
 .../intrinsic-header-tests/riscv_packed.c     | 1016 +++++++++++++++++
 1 file changed, 1016 insertions(+)
 create mode 100644 cross-project-tests/intrinsic-header-tests/riscv_packed.c

diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed.c b/cross-project-tests/intrinsic-header-tests/riscv_packed.c
new file mode 100644
index 0000000000000..ba4973620c2c5
--- /dev/null
+++ b/cross-project-tests/intrinsic-header-tests/riscv_packed.c
@@ -0,0 +1,1016 @@
+// REQUIRES: riscv-registered-target
+// expected-no-diagnostics
+
+// RUN: %clang %s -O2 -S -o - --target=riscv32 \
+// RUN:   -menable-experimental-extensions -march=rv32i_p0p21 -Werror \
+// RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
+// RUN: %clang %s -O2 -S -o - --target=riscv64 \
+// RUN:   -menable-experimental-extensions -march=rv64i_p0p21 -Werror \
+// RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
+
+#include <riscv_packed.h>
+
+// CHECK-LABEL: test_pmv_s_u8x4:
+// CHECK:       pmv.bs
+uint8x4_t test_pmv_s_u8x4(uint8_t x) { return __riscv_pmv_s_u8x4(x); }
+
+// CHECK-LABEL: test_pmv_s_i8x4:
+// CHECK:       pmv.bs
+int8x4_t test_pmv_s_i8x4(int8_t x) { return __riscv_pmv_s_i8x4(x); }
+
+// CHECK-LABEL: test_pmv_s_u16x2:
+// CHECK:       pmv.hs
+uint16x2_t test_pmv_s_u16x2(uint16_t x) { return __riscv_pmv_s_u16x2(x); }
+
+// CHECK-LABEL: test_pmv_s_i16x2:
+// CHECK:       pmv.hs
+int16x2_t test_pmv_s_i16x2(int16_t x) { return __riscv_pmv_s_i16x2(x); }
+
+// TODO: On RV64, the 32-bit packed constant splat emits `lui`+`addi` instead
+// of `pli.b`/`pli.h` or `plui.h`.
+// CHECK-LABEL: test_pmv_s_u8x4_imm:
+// RV32:        pli.b
+// RV64:        lui
+uint8x4_t test_pmv_s_u8x4_imm(void) { return __riscv_pmv_s_u8x4(5); }
+
+// CHECK-LABEL: test_pmv_s_i8x4_imm:
+// RV32:        pli.b
+// RV64:        lui
+int8x4_t test_pmv_s_i8x4_imm(void) { return __riscv_pmv_s_i8x4(-3); }
+
+// CHECK-LABEL: test_pmv_s_u16x2_imm:
+// RV32:        pli.h
+// RV64:        lui
+uint16x2_t test_pmv_s_u16x2_imm(void) { return __riscv_pmv_s_u16x2(42); }
+
+// CHECK-LABEL: test_pmv_s_i16x2_imm:
+// RV32:        pli.h
+// RV64:        lui
+int16x2_t test_pmv_s_i16x2_imm(void) { return __riscv_pmv_s_i16x2(-5); }
+
+// CHECK-LABEL: test_pmv_s_u16x2_imm_hi:
+// RV32:        plui.h
+// RV64:        lui
+uint16x2_t test_pmv_s_u16x2_imm_hi(void) { return __riscv_pmv_s_u16x2(0x3600); }
+
+// CHECK-LABEL: test_pmv_s_i16x2_imm_hi:
+// RV32:        plui.h
+// RV64:        lui
+int16x2_t test_pmv_s_i16x2_imm_hi(void) { return __riscv_pmv_s_i16x2(0x3600); }
+
+// CHECK-LABEL: test_pmv_s_u8x8:
+// RV32:        pmv.dbs
+// RV64:        pmv.bs
+uint8x8_t test_pmv_s_u8x8(uint8_t x) { return __riscv_pmv_s_u8x8(x); }
+
+// CHECK-LABEL: test_pmv_s_i8x8:
+// RV32:        pmv.dbs
+// RV64:        pmv.bs
+int8x8_t test_pmv_s_i8x8(int8_t x) { return __riscv_pmv_s_i8x8(x); }
+
+// CHECK-LABEL: test_pmv_s_u16x4:
+// RV32:        pmv.dhs
+// RV64:        pmv.hs
+uint16x4_t test_pmv_s_u16x4(uint16_t x) { return __riscv_pmv_s_u16x4(x); }
+
+// CHECK-LABEL: test_pmv_s_i16x4:
+// RV32:        pmv.dhs
+// RV64:        pmv.hs
+int16x4_t test_pmv_s_i16x4(int16_t x) { return __riscv_pmv_s_i16x4(x); }
+
+// TODO: On RV32, the 32x2 variable splat emits a plain `mv` instead of
+// `padd.dws` with rs1_p=x0.
+// CHECK-LABEL: test_pmv_s_u32x2:
+// RV32:        mv{{[[:space:]]}}
+// RV64:        pmv.ws
+uint32x2_t test_pmv_s_u32x2(uint32_t x) { return __riscv_pmv_s_u32x2(x); }
+
+// CHECK-LABEL: test_pmv_s_i32x2:
+// RV32:        mv{{[[:space:]]}}
+// RV64:        pmv.ws
+int32x2_t test_pmv_s_i32x2(int32_t x) { return __riscv_pmv_s_i32x2(x); }
+
+// TODO: On RV32, the 64-bit packed constant splat emits two `pli.b`/`pli.h`/
+// `plui.h` instead of one `pli.db`/`pli.dh`/`plui.dh`.
+// CHECK-LABEL: test_pmv_s_u8x8_imm:
+// RV32-COUNT-2: pli.b
+// RV64:         pli.b
+uint8x8_t test_pmv_s_u8x8_imm(void) { return __riscv_pmv_s_u8x8(5); }
+
+// CHECK-LABEL: test_pmv_s_i8x8_imm:
+// RV32-COUNT-2: pli.b
+// RV64:         pli.b
+int8x8_t test_pmv_s_i8x8_imm(void) { return __riscv_pmv_s_i8x8(-3); }
+
+// CHECK-LABEL: test_pmv_s_u16x4_imm:
+// RV32-COUNT-2: pli.h
+// RV64:         pli.h
+uint16x4_t test_pmv_s_u16x4_imm(void) { return __riscv_pmv_s_u16x4(42); }
+
+// CHECK-LABEL: test_pmv_s_i16x4_imm:
+// RV32-COUNT-2: pli.h
+// RV64:         pli.h
+int16x4_t test_pmv_s_i16x4_imm(void) { return __riscv_pmv_s_i16x4(-5); }
+
+// CHECK-LABEL: test_pmv_s_u16x4_imm_hi:
+// RV32-COUNT-2: plui.h
+// RV64:         plui.h
+uint16x4_t test_pmv_s_u16x4_imm_hi(void) { return __riscv_pmv_s_u16x4(0x3600); }
+
+// CHECK-LABEL: test_pmv_s_i16x4_imm_hi:
+// RV32-COUNT-2: plui.h
+// RV64:         plui.h
+int16x4_t test_pmv_s_i16x4_imm_hi(void) { return __riscv_pmv_s_i16x4(0x3600); }
+
+// Note: On RV32, 32x2 splat constants that fit `addi`'s 12-bit immediate fold
+// to 2x `li`. Larger constants emit `lui`+`addi`+`mv`; see `_imm_big` below.
+// CHECK-LABEL: test_pmv_s_u32x2_imm:
+// RV32-COUNT-2: li{{[[:space:]]}}
+// RV64:         pli.w
+uint32x2_t test_pmv_s_u32x2_imm(void) { return __riscv_pmv_s_u32x2(42); }
+
+// CHECK-LABEL: test_pmv_s_i32x2_imm:
+// RV32-COUNT-2: li{{[[:space:]]}}
+// RV64:         pli.w
+int32x2_t test_pmv_s_i32x2_imm(void) { return __riscv_pmv_s_i32x2(-5); }
+
+// CHECK-LABEL: test_pmv_s_u32x2_imm_big:
+// RV32:        lui
+// RV32-NEXT:   addi
+// RV32-NEXT:   mv{{[[:space:]]}}
+// RV32-NEXT:   ret
+uint32x2_t test_pmv_s_u32x2_imm_big(void) {
+  return __riscv_pmv_s_u32x2(0x12345);
+}
+
+// CHECK-LABEL: test_pmv_s_i32x2_imm_big:
+// RV32:        lui
+// RV32-NEXT:   addi
+// RV32-NEXT:   mv{{[[:space:]]}}
+// RV32-NEXT:   ret
+int32x2_t test_pmv_s_i32x2_imm_big(void) {
+  return __riscv_pmv_s_i32x2(0x12345);
+}
+
+// CHECK-LABEL: test_psll_s_u8x4:
+// CHECK:       psll.bs
+uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned n) {
+  return __riscv_psll_s_u8x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i8x4:
+// CHECK:       psll.bs
+int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned n) {
+  return __riscv_psll_s_i8x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u16x2:
+// CHECK:       psll.hs
+uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned n) {
+  return __riscv_psll_s_u16x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i16x2:
+// CHECK:       psll.hs
+int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned n) {
+  return __riscv_psll_s_i16x2(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x4:
+// CHECK:       psrl.bs
+uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned n) {
+  return __riscv_psrl_s_u8x4(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x2:
+// CHECK:       psrl.hs
+uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned n) {
+  return __riscv_psrl_s_u16x2(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i8x4:
+// CHECK:       psra.bs
+int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned n) {
+  return __riscv_psra_s_i8x4(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i16x2:
+// CHECK:       psra.hs
+int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned n) {
+  return __riscv_psra_s_i16x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u8x4_imm:
+// CHECK:       pslli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x4_t test_psll_s_u8x4_imm(uint8x4_t a) {
+  return __riscv_psll_s_u8x4(a, 2);
+}
+
+// CHECK-LABEL: test_psll_s_i8x4_imm:
+// CHECK:       pslli.b{{[[:space:]]+}}{{.*}}, 3
+int8x4_t test_psll_s_i8x4_imm(int8x4_t a) { return __riscv_psll_s_i8x4(a, 3); }
+
+// CHECK-LABEL: test_psll_s_u16x2_imm:
+// CHECK:       pslli.h{{[[:space:]]+}}{{.*}}, 5
+uint16x2_t test_psll_s_u16x2_imm(uint16x2_t a) {
+  return __riscv_psll_s_u16x2(a, 5);
+}
+
+// CHECK-LABEL: test_psll_s_i16x2_imm:
+// CHECK:       pslli.h{{[[:space:]]+}}{{.*}}, 7
+int16x2_t test_psll_s_i16x2_imm(int16x2_t a) {
+  return __riscv_psll_s_i16x2(a, 7);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x4_imm:
+// CHECK:       psrli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x4_t test_psrl_s_u8x4_imm(uint8x4_t a) {
+  return __riscv_psrl_s_u8x4(a, 2);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x2_imm:
+// CHECK:       psrli.h{{[[:space:]]+}}{{.*}}, 3
+uint16x2_t test_psrl_s_u16x2_imm(uint16x2_t a) {
+  return __riscv_psrl_s_u16x2(a, 3);
+}
+
+// CHECK-LABEL: test_psra_s_i8x4_imm:
+// CHECK:       psrai.b{{[[:space:]]+}}{{.*}}, 4
+int8x4_t test_psra_s_i8x4_imm(int8x4_t a) { return __riscv_psra_s_i8x4(a, 4); }
+
+// CHECK-LABEL: test_psra_s_i16x2_imm:
+// CHECK:       psrai.h{{[[:space:]]+}}{{.*}}, 5
+int16x2_t test_psra_s_i16x2_imm(int16x2_t a) {
+  return __riscv_psra_s_i16x2(a, 5);
+}
+
+// CHECK-LABEL: test_psll_s_u8x8:
+// RV32:        psll.dbs
+// RV64:        psll.bs
+uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned n) {
+  return __riscv_psll_s_u8x8(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i8x8:
+// RV32:        psll.dbs
+// RV64:        psll.bs
+int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned n) {
+  return __riscv_psll_s_i8x8(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u16x4:
+// RV32:        psll.dhs
+// RV64:        psll.hs
+uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned n) {
+  return __riscv_psll_s_u16x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i16x4:
+// RV32:        psll.dhs
+// RV64:        psll.hs
+int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned n) {
+  return __riscv_psll_s_i16x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u32x2:
+// RV32:        psll.dws
+// RV64:        psll.ws
+uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned n) {
+  return __riscv_psll_s_u32x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i32x2:
+// RV32:        psll.dws
+// RV64:        psll.ws
+int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned n) {
+  return __riscv_psll_s_i32x2(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x8:
+// RV32:        psrl.dbs
+// RV64:        psrl.bs
+uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned n) {
+  return __riscv_psrl_s_u8x8(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x4:
+// RV32:        psrl.dhs
+// RV64:        psrl.hs
+uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned n) {
+  return __riscv_psrl_s_u16x4(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u32x2:
+// RV32:        psrl.dws
+// RV64:        psrl.ws
+uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned n) {
+  return __riscv_psrl_s_u32x2(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i8x8:
+// RV32:        psra.dbs
+// RV64:        psra.bs
+int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned n) {
+  return __riscv_psra_s_i8x8(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i16x4:
+// RV32:        psra.dhs
+// RV64:        psra.hs
+int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned n) {
+  return __riscv_psra_s_i16x4(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i32x2:
+// RV32:        psra.dws
+// RV64:        psra.ws
+int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned n) {
+  return __riscv_psra_s_i32x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u8x8_imm:
+// RV32:        pslli.db{{[[:space:]]+}}{{.*}}, 2
+// RV64:        pslli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x8_t test_psll_s_u8x8_imm(uint8x8_t a) {
+  return __riscv_psll_s_u8x8(a, 2);
+}
+
+// CHECK-LABEL: test_psll_s_i8x8_imm:
+// RV32:        pslli.db{{[[:space:]]+}}{{.*}}, 3
+// RV64:        pslli.b{{[[:space:]]+}}{{.*}}, 3
+int8x8_t test_psll_s_i8x8_imm(int8x8_t a) { return __riscv_psll_s_i8x8(a, 3); }
+
+// CHECK-LABEL: test_psll_s_u16x4_imm:
+// RV32:        pslli.dh{{[[:space:]]+}}{{.*}}, 4
+// RV64:        pslli.h{{[[:space:]]+}}{{.*}}, 4
+uint16x4_t test_psll_s_u16x4_imm(uint16x4_t a) {
+  return __riscv_psll_s_u16x4(a, 4);
+}
+
+// CHECK-LABEL: test_psll_s_i16x4_imm:
+// RV32:        pslli.dh{{[[:space:]]+}}{{.*}}, 5
+// RV64:        pslli.h{{[[:space:]]+}}{{.*}}, 5
+int16x4_t test_psll_s_i16x4_imm(int16x4_t a) {
+  return __riscv_psll_s_i16x4(a, 5);
+}
+
+// CHECK-LABEL: test_psll_s_u32x2_imm:
+// RV32:        pslli.dw{{[[:space:]]+}}{{.*}}, 7
+// RV64:        pslli.w{{[[:space:]]+}}{{.*}}, 7
+uint32x2_t test_psll_s_u32x2_imm(uint32x2_t a) {
+  return __riscv_psll_s_u32x2(a, 7);
+}
+
+// CHECK-LABEL: test_psll_s_i32x2_imm:
+// RV32:        pslli.dw{{[[:space:]]+}}{{.*}}, 9
+// RV64:        pslli.w{{[[:space:]]+}}{{.*}}, 9
+int32x2_t test_psll_s_i32x2_imm(int32x2_t a) {
+  return __riscv_psll_s_i32x2(a, 9);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x8_imm:
+// RV32:        psrli.db{{[[:space:]]+}}{{.*}}, 2
+// RV64:        psrli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x8_t test_psrl_s_u8x8_imm(uint8x8_t a) {
+  return __riscv_psrl_s_u8x8(a, 2);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x4_imm:
+// RV32:        psrli.dh{{[[:space:]]+}}{{.*}}, 3
+// RV64:        psrli.h{{[[:space:]]+}}{{.*}}, 3
+uint16x4_t test_psrl_s_u16x4_imm(uint16x4_t a) {
+  return __riscv_psrl_s_u16x4(a, 3);
+}
+
+// CHECK-LABEL: test_psrl_s_u32x2_imm:
+// RV32:        psrli.dw{{[[:space:]]+}}{{.*}}, 5
+// RV64:        psrli.w{{[[:space:]]+}}{{.*}}, 5
+uint32x2_t test_psrl_s_u32x2_imm(uint32x2_t a) {
+  return __riscv_psrl_s_u32x2(a, 5);
+}
+
+// CHECK-LABEL: test_psra_s_i8x8_imm:
+// RV32:        psrai.db{{[[:space:]]+}}{{.*}}, 4
+// RV64:        psrai.b{{[[:space:]]+}}{{.*}}, 4
+int8x8_t test_psra_s_i8x8_imm(int8x8_t a) { return __riscv_psra_s_i8x8(a, 4); }
+
+// CHECK-LABEL: test_psra_s_i16x4_imm:
+// RV32:        psrai.dh{{[[:space:]]+}}{{.*}}, 5
+// RV64:        psrai.h{{[[:space:]]+}}{{.*}}, 5
+int16x4_t test_psra_s_i16x4_imm(int16x4_t a) {
+  return __riscv_psra_s_i16x4(a, 5);
+}
+
+// CHECK-LABEL: test_psra_s_i32x2_imm:
+// RV32:        psrai.dw{{[[:space:]]+}}{{.*}}, 11
+// RV64:        psrai.w{{[[:space:]]+}}{{.*}}, 11
+int32x2_t test_psra_s_i32x2_imm(int32x2_t a) {
+  return __riscv_psra_s_i32x2(a, 11);
+}
+
+// CHECK-LABEL: test_padd_s_u8x4:
+// CHECK:       padd.bs
+uint8x4_t test_padd_s_u8x4(uint8x4_t a, uint8_t b) {
+  return __riscv_padd_s_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i8x4:
+// CHECK:       padd.bs
+int8x4_t test_padd_s_i8x4(int8x4_t a, int8_t b) {
+  return __riscv_padd_s_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u16x2:
+// CHECK:       padd.hs
+uint16x2_t test_padd_s_u16x2(uint16x2_t a, uint16_t b) {
+  return __riscv_padd_s_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i16x2:
+// CHECK:       padd.hs
+int16x2_t test_padd_s_i16x2(int16x2_t a, int16_t b) {
+  return __riscv_padd_s_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u8x8:
+// RV32:        padd.dbs
+// RV64:        padd.bs
+uint8x8_t test_padd_s_u8x8(uint8x8_t a, uint8_t b) {
+  return __riscv_padd_s_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i8x8:
+// RV32:        padd.dbs
+// RV64:        padd.bs
+int8x8_t test_padd_s_i8x8(int8x8_t a, int8_t b) {
+  return __riscv_padd_s_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u16x4:
+// RV32:        padd.dhs
+// RV64:        padd.hs
+uint16x4_t test_padd_s_u16x4(uint16x4_t a, uint16_t b) {
+  return __riscv_padd_s_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i16x4:
+// RV32:        padd.dhs
+// RV64:        padd.hs
+int16x4_t test_padd_s_i16x4(int16x4_t a, int16_t b) {
+  return __riscv_padd_s_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u32x2:
+// RV32:        padd.dws
+// RV64:        padd.ws
+uint32x2_t test_padd_s_u32x2(uint32x2_t a, uint32_t b) {
+  return __riscv_padd_s_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i32x2:
+// RV32:        padd.dws
+// RV64:        padd.ws
+int32x2_t test_padd_s_i32x2(int32x2_t a, int32_t b) {
+  return __riscv_padd_s_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_i8x4:
+// CHECK:       padd.b
+int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_padd_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_u8x4:
+// CHECK:       padd.b
+uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_padd_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_i16x2:
+// CHECK:       padd.h
+int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_padd_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_u16x2:
+// CHECK:       padd.h
+uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_padd_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_i8x4:
+// CHECK:       psub.b
+int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psub_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_u8x4:
+// CHECK:       psub.b
+uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psub_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_i16x2:
+// CHECK:       psub.h
+int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psub_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_u16x2:
+// CHECK:       psub.h
+uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psub_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pneg_i8x4:
+// CHECK:       pneg.b
+int8x4_t test_pneg_i8x4(int8x4_t a) { return __riscv_pneg_i8x4(a); }
+
+// CHECK-LABEL: test_pneg_i16x2:
+// CHECK:       pneg.h
+int16x2_t test_pneg_i16x2(int16x2_t a) { return __riscv_pneg_i16x2(a); }
+
+// CHECK-LABEL: test_padd_i8x8:
+// RV32:        padd.db
+// RV64:        padd.b
+int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_padd_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_u8x8:
+// RV32:        padd.db
+// RV64:        padd.b
+uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_padd_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_i16x4:
+// RV32:        padd.dh
+// RV64:        padd.h
+int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_padd_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_u16x4:
+// RV32:        padd.dh
+// RV64:        padd.h
+uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_padd_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_i32x2:
+// RV32:        padd.dw
+// RV64:        padd.w
+int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_padd_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_u32x2:
+// RV32:        padd.dw
+// RV64:        padd.w
+uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_padd_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_i8x8:
+// RV32:        psub.db
+// RV64:        psub.b
+int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psub_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_psub_u8x8:
+// RV32:        psub.db
+// RV64:        psub.b
+uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psub_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_psub_i16x4:
+// RV32:        psub.dh
+// RV64:        psub.h
+int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psub_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_u16x4:
+// RV32:        psub.dh
+// RV64:        psub.h
+uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psub_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_psub_i32x2:
+// RV32:        psub.dw
+// RV64:        psub.w
+int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psub_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_psub_u32x2:
+// RV32:        psub.dw
+// RV64:        psub.w
+uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psub_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pneg_i8x8:
+// RV32:        pneg.db
+// RV64:        pneg.b
+int8x8_t test_pneg_i8x8(int8x8_t a) { return __riscv_pneg_i8x8(a); }
+
+// CHECK-LABEL: test_pneg_i16x4:
+// RV32:        pneg.dh
+// RV64:        pneg.h
+int16x4_t test_pneg_i16x4(int16x4_t a) { return __riscv_pneg_i16x4(a); }
+
+// CHECK-LABEL: test_pneg_i32x2:
+// RV32:        pneg.dw
+// RV64:        pneg.w
+int32x2_t test_pneg_i32x2(int32x2_t a) { return __riscv_pneg_i32x2(a); }
+
+// CHECK-LABEL: test_pmin_i8x4:
+// CHECK:       pmin.b
+int8x4_t test_pmin_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pmin_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_pmin_i16x2:
+// CHECK:       pmin.h
+int16x2_t test_pmin_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pmin_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_pminu_u8x4:
+// CHECK:       pminu.b
+uint8x4_t test_pminu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pminu_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_pminu_u16x2:
+// CHECK:       pminu.h
+uint16x2_t test_pminu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pminu_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pmax_i8x4:
+// CHECK:       pmax.b
+int8x4_t test_pmax_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pmax_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_pmax_i16x2:
+// CHECK:       pmax.h
+int16x2_t test_pmax_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pmax_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_pmaxu_u8x4:
+// CHECK:       pmaxu.b
+uint8x4_t test_pmaxu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pmaxu_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_pmaxu_u16x2:
+// CHECK:       pmaxu.h
+uint16x2_t test_pmaxu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pmaxu_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pmin_i8x8:
+// RV32:        pmin.db
+// RV64:        pmin.b
+int8x8_t test_pmin_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pmin_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_pmin_i16x4:
+// RV32:        pmin.dh
+// RV64:        pmin.h
+int16x4_t test_pmin_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pmin_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pmin_i32x2:
+// RV32:        pmin.dw
+// RV64:        pmin.w
+int32x2_t test_pmin_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pmin_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_pminu_u8x8:
+// RV32:        pminu.db
+// RV64:        pminu.b
+uint8x8_t test_pminu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pminu_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_pminu_u16x4:
+// RV32:        pminu.dh
+// RV64:        pminu.h
+uint16x4_t test_pminu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pminu_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_pminu_u32x2:
+// RV32:        pminu.dw
+// RV64:        pminu.w
+uint32x2_t test_pminu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pminu_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pmax_i8x8:
+// RV32:        pmax.db
+// RV64:        pmax.b
+int8x8_t test_pmax_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pmax_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_pmax_i16x4:
+// RV32:        pmax.dh
+// RV64:        pmax.h
+int16x4_t test_pmax_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pmax_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pmax_i32x2:
+// RV32:        pmax.dw
+// RV64:        pmax.w
+int32x2_t test_pmax_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pmax_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_pmaxu_u8x8:
+// RV32:        pmaxu.db
+// RV64:        pmaxu.b
+uint8x8_t test_pmaxu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pmaxu_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_pmaxu_u16x4:
+// RV32:        pmaxu.dh
+// RV64:        pmaxu.h
+uint16x4_t test_pmaxu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pmaxu_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_pmaxu_u32x2:
+// RV32:        pmaxu.dw
+// RV64:        pmaxu.w
+uint32x2_t test_pmaxu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pmaxu_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pand_i8x4:
+// CHECK:       and{{[[:space:]]}}
+int8x4_t test_pand_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pand_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_pand_u8x4:
+// CHECK:       and{{[[:space:]]}}
+uint8x4_t test_pand_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pand_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_pand_i16x2:
+// CHECK:       and{{[[:space:]]}}
+int16x2_t test_pand_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pand_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_pand_u16x2:
+// CHECK:       and{{[[:space:]]}}
+uint16x2_t test_pand_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pand_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_por_i8x4:
+// CHECK:       or{{[[:space:]]}}
+int8x4_t test_por_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_por_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_por_u8x4:
+// CHECK:       or{{[[:space:]]}}
+uint8x4_t test_por_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_por_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_por_i16x2:
+// CHECK:       or{{[[:space:]]}}
+int16x2_t test_por_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_por_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_por_u16x2:
+// CHECK:       or{{[[:space:]]}}
+uint16x2_t test_por_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_por_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pxor_i8x4:
+// CHECK:       xor{{[[:space:]]}}
+int8x4_t test_pxor_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pxor_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_pxor_u8x4:
+// CHECK:       xor{{[[:space:]]}}
+uint8x4_t test_pxor_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pxor_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_pxor_i16x2:
+// CHECK:       xor{{[[:space:]]}}
+int16x2_t test_pxor_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pxor_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_pxor_u16x2:
+// CHECK:       xor{{[[:space:]]}}
+uint16x2_t test_pxor_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pxor_u16x2(a, b);
+}
+
+// TODO: On RV64, vector `NOT` on a 32-bit packed type emits `li`+`xor`
+// instead of the `not` alias.
+// CHECK-LABEL: test_pnot_i8x4:
+// RV32:        not{{[[:space:]]}}
+// RV64:        li
+// RV64-NEXT:   xor{{[[:space:]]}}
+int8x4_t test_pnot_i8x4(int8x4_t a) { return __riscv_pnot_i8x4(a); }
+
+// CHECK-LABEL: test_pnot_u8x4:
+// RV32:        not{{[[:space:]]}}
+// RV64:        li
+// RV64-NEXT:   xor{{[[:space:]]}}
+uint8x4_t test_pnot_u8x4(uint8x4_t a) { return __riscv_pnot_u8x4(a); }
+
+// CHECK-LABEL: test_pnot_i16x2:
+// RV32:        not{{[[:space:]]}}
+// RV64:        li
+// RV64-NEXT:   xor{{[[:space:]]}}
+int16x2_t test_pnot_i16x2(int16x2_t a) { return __riscv_pnot_i16x2(a); }
+
+// CHECK-LABEL: test_pnot_u16x2:
+// RV32:        not{{[[:space:]]}}
+// RV64:        li
+// RV64-NEXT:   xor{{[[:space:]]}}
+uint16x2_t test_pnot_u16x2(uint16x2_t a) { return __riscv_pnot_u16x2(a); }
+
+// CHECK-LABEL: test_pand_i8x8:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+int8x8_t test_pand_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pand_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_pand_u8x8:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+uint8x8_t test_pand_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pand_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_pand_i16x4:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+int16x4_t test_pand_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pand_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pand_u16x4:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+uint16x4_t test_pand_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pand_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_pand_i32x2:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+int32x2_t test_pand_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pand_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_pand_u32x2:
+// RV32-COUNT-2: and{{[[:space:]]}}
+// RV64:         and{{[[:space:]]}}
+uint32x2_t test_pand_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pand_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_por_i8x8:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+int8x8_t test_por_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_por_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_por_u8x8:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+uint8x8_t test_por_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_por_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_por_i16x4:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+int16x4_t test_por_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_por_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_por_u16x4:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+uint16x4_t test_por_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_por_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_por_i32x2:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+int32x2_t test_por_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_por_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_por_u32x2:
+// RV32-COUNT-2: or{{[[:space:]]}}
+// RV64:         or{{[[:space:]]}}
+uint32x2_t test_por_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_por_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pxor_i8x8:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+int8x8_t test_pxor_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pxor_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_pxor_u8x8:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+uint8x8_t test_pxor_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pxor_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_pxor_i16x4:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+int16x4_t test_pxor_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pxor_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pxor_u16x4:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+uint16x4_t test_pxor_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pxor_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_pxor_i32x2:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+int32x2_t test_pxor_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pxor_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_pxor_u32x2:
+// RV32-COUNT-2: xor{{[[:space:]]}}
+// RV64:         xor{{[[:space:]]}}
+uint32x2_t test_pxor_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pxor_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pnot_i8x8:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+int8x8_t test_pnot_i8x8(int8x8_t a) { return __riscv_pnot_i8x8(a); }
+
+// CHECK-LABEL: test_pnot_u8x8:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+uint8x8_t test_pnot_u8x8(uint8x8_t a) { return __riscv_pnot_u8x8(a); }
+
+// CHECK-LABEL: test_pnot_i16x4:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+int16x4_t test_pnot_i16x4(int16x4_t a) { return __riscv_pnot_i16x4(a); }
+
+// CHECK-LABEL: test_pnot_u16x4:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+uint16x4_t test_pnot_u16x4(uint16x4_t a) { return __riscv_pnot_u16x4(a); }
+
+// CHECK-LABEL: test_pnot_i32x2:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+int32x2_t test_pnot_i32x2(int32x2_t a) { return __riscv_pnot_i32x2(a); }
+
+// CHECK-LABEL: test_pnot_u32x2:
+// RV32-COUNT-2: not{{[[:space:]]}}
+// RV64:         not{{[[:space:]]}}
+uint32x2_t test_pnot_u32x2(uint32x2_t a) { return __riscv_pnot_u32x2(a); }

>From 0d38de0290789cf3d281171e2675b4e6d42947e4 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Thu, 21 May 2026 17:23:38 +0800
Subject: [PATCH 10/19] drop __aligned__

---
 clang/lib/Headers/riscv_packed.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/clang/lib/Headers/riscv_packed.h b/clang/lib/Headers/riscv_packed.h
index c7605de340faa..6805a94d941d7 100644
--- a/clang/lib/Headers/riscv_packed.h
+++ b/clang/lib/Headers/riscv_packed.h
@@ -18,17 +18,17 @@ extern "C" {
 
 /* Packed SIMD Types */
 
-typedef int8_t int8x4_t __attribute__((__vector_size__(4), __aligned__(4)));
-typedef uint8_t uint8x4_t __attribute__((__vector_size__(4), __aligned__(4)));
-typedef int16_t int16x2_t __attribute__((__vector_size__(4), __aligned__(4)));
-typedef uint16_t uint16x2_t __attribute__((__vector_size__(4), __aligned__(4)));
-
-typedef int8_t int8x8_t __attribute__((__vector_size__(8), __aligned__(8)));
-typedef uint8_t uint8x8_t __attribute__((__vector_size__(8), __aligned__(8)));
-typedef int16_t int16x4_t __attribute__((__vector_size__(8), __aligned__(8)));
-typedef uint16_t uint16x4_t __attribute__((__vector_size__(8), __aligned__(8)));
-typedef int32_t int32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
-typedef uint32_t uint32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
+typedef int8_t int8x4_t __attribute__((__vector_size__(4)));
+typedef uint8_t uint8x4_t __attribute__((__vector_size__(4)));
+typedef int16_t int16x2_t __attribute__((__vector_size__(4)));
+typedef uint16_t uint16x2_t __attribute__((__vector_size__(4)));
+
+typedef int8_t int8x8_t __attribute__((__vector_size__(8)));
+typedef uint8_t uint8x8_t __attribute__((__vector_size__(8)));
+typedef int16_t int16x4_t __attribute__((__vector_size__(8)));
+typedef uint16_t uint16x4_t __attribute__((__vector_size__(8)));
+typedef int32_t int32x2_t __attribute__((__vector_size__(8)));
+typedef uint32_t uint32x2_t __attribute__((__vector_size__(8)));
 
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
 

>From a8d42aa15411a9959bd42d6a57a13ff0dfeec155 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Wed, 27 May 2026 10:22:03 +0800
Subject: [PATCH 11/19] RV64 32-bit pnot now emits not

---
 .../intrinsic-header-tests/riscv_packed.c      | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed.c b/cross-project-tests/intrinsic-header-tests/riscv_packed.c
index ba4973620c2c5..1afbb3eac007d 100644
--- a/cross-project-tests/intrinsic-header-tests/riscv_packed.c
+++ b/cross-project-tests/intrinsic-header-tests/riscv_packed.c
@@ -833,30 +833,20 @@ uint16x2_t test_pxor_u16x2(uint16x2_t a, uint16x2_t b) {
   return __riscv_pxor_u16x2(a, b);
 }
 
-// TODO: On RV64, vector `NOT` on a 32-bit packed type emits `li`+`xor`
-// instead of the `not` alias.
 // CHECK-LABEL: test_pnot_i8x4:
-// RV32:        not{{[[:space:]]}}
-// RV64:        li
-// RV64-NEXT:   xor{{[[:space:]]}}
+// CHECK:       not{{[[:space:]]}}
 int8x4_t test_pnot_i8x4(int8x4_t a) { return __riscv_pnot_i8x4(a); }
 
 // CHECK-LABEL: test_pnot_u8x4:
-// RV32:        not{{[[:space:]]}}
-// RV64:        li
-// RV64-NEXT:   xor{{[[:space:]]}}
+// CHECK:       not{{[[:space:]]}}
 uint8x4_t test_pnot_u8x4(uint8x4_t a) { return __riscv_pnot_u8x4(a); }
 
 // CHECK-LABEL: test_pnot_i16x2:
-// RV32:        not{{[[:space:]]}}
-// RV64:        li
-// RV64-NEXT:   xor{{[[:space:]]}}
+// CHECK:       not{{[[:space:]]}}
 int16x2_t test_pnot_i16x2(int16x2_t a) { return __riscv_pnot_i16x2(a); }
 
 // CHECK-LABEL: test_pnot_u16x2:
-// RV32:        not{{[[:space:]]}}
-// RV64:        li
-// RV64-NEXT:   xor{{[[:space:]]}}
+// CHECK:       not{{[[:space:]]}}
 uint16x2_t test_pnot_u16x2(uint16x2_t a) { return __riscv_pnot_u16x2(a); }
 
 // CHECK-LABEL: test_pand_i8x8:

>From 3d20ce00633d89113d8db2d3f148fb40b3b54da9 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Wed, 27 May 2026 19:42:04 +0800
Subject: [PATCH 12/19] add -verify -Wextra to RUN lines

---
 cross-project-tests/intrinsic-header-tests/riscv_packed.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed.c b/cross-project-tests/intrinsic-header-tests/riscv_packed.c
index 1afbb3eac007d..6afc1b2dce869 100644
--- a/cross-project-tests/intrinsic-header-tests/riscv_packed.c
+++ b/cross-project-tests/intrinsic-header-tests/riscv_packed.c
@@ -2,10 +2,10 @@
 // expected-no-diagnostics
 
 // RUN: %clang %s -O2 -S -o - --target=riscv32 \
-// RUN:   -menable-experimental-extensions -march=rv32i_p0p21 -Werror \
+// RUN:   -menable-experimental-extensions -march=rv32i_p0p21 -Werror -Wextra -Xclang -verify \
 // RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
 // RUN: %clang %s -O2 -S -o - --target=riscv64 \
-// RUN:   -menable-experimental-extensions -march=rv64i_p0p21 -Werror \
+// RUN:   -menable-experimental-extensions -march=rv64i_p0p21 -Werror -Wextra -Xclang -verify \
 // RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
 
 #include <riscv_packed.h>

>From 163f936df5c21ad9c27b48198b2cf1067fb04e0b Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Fri, 29 May 2026 10:13:37 +0800
Subject: [PATCH 13/19] chore: trigger PR update


>From 043782a9475beea0317f514643914be7bb7a825a Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Tue, 2 Jun 2026 16:44:00 +0800
Subject: [PATCH 14/19] rename riscv_packed.h to riscv_packed_simd.h

---
 clang/lib/Headers/CMakeLists.txt                          | 2 +-
 clang/lib/Headers/{riscv_packed.h => riscv_packed_simd.h} | 8 ++++----
 clang/test/CodeGen/RISCV/rvp-intrinsics.c                 | 2 +-
 .../{riscv_packed.c => riscv_packed_simd.c}               | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)
 rename clang/lib/Headers/{riscv_packed.h => riscv_packed_simd.h} (98%)
 rename cross-project-tests/intrinsic-header-tests/{riscv_packed.c => riscv_packed_simd.c} (99%)

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 59cd039747ae6..439f2725168ba 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -140,7 +140,7 @@ set(riscv_files
   riscv_corev_alu.h
   riscv_mips.h
   riscv_nds.h
-  riscv_packed.h
+  riscv_packed_simd.h
   sifive_vector.h
   )
 
diff --git a/clang/lib/Headers/riscv_packed.h b/clang/lib/Headers/riscv_packed_simd.h
similarity index 98%
rename from clang/lib/Headers/riscv_packed.h
rename to clang/lib/Headers/riscv_packed_simd.h
index 6805a94d941d7..a25fb8a696f1d 100644
--- a/clang/lib/Headers/riscv_packed.h
+++ b/clang/lib/Headers/riscv_packed_simd.h
@@ -1,4 +1,4 @@
-/*===---- riscv_packed.h - RISC-V P intrinsics -----------------------------===
+/*===---- riscv_packed_simd.h - RISC-V P intrinsics ------------------------===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -7,8 +7,8 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef __RISCV_PACKED_H
-#define __RISCV_PACKED_H
+#ifndef __RISCV_PACKED_SIMD_H
+#define __RISCV_PACKED_SIMD_H
 
 #include <stdint.h>
 
@@ -246,4 +246,4 @@ __packed_unary_op(pnot_u32x2, uint32x2_t, ~)
 }
 #endif
 
-#endif /* __RISCV_PACKED_H */
+#endif /* __RISCV_PACKED_SIMD_H */
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index e79c98dfd93a5..c84eb6ac2e270 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -6,7 +6,7 @@
 // RUN:   -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=sroa,instcombine | FileCheck %s --check-prefix=RV64
 
-#include <riscv_packed.h>
+#include <riscv_packed_simd.h>
 
 /* Packed Splat (32-bit) */
 
diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed.c b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
similarity index 99%
rename from cross-project-tests/intrinsic-header-tests/riscv_packed.c
rename to cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
index 6afc1b2dce869..f5a31a900403a 100644
--- a/cross-project-tests/intrinsic-header-tests/riscv_packed.c
+++ b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
@@ -8,7 +8,7 @@
 // RUN:   -menable-experimental-extensions -march=rv64i_p0p21 -Werror -Wextra -Xclang -verify \
 // RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
 
-#include <riscv_packed.h>
+#include <riscv_packed_simd.h>
 
 // CHECK-LABEL: test_pmv_s_u8x4:
 // CHECK:       pmv.bs

>From 64ab5bfed70db7f26630ac7ed2e579399d1e900b Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Tue, 2 Jun 2026 17:59:13 +0800
Subject: [PATCH 15/19] [Clang][RISCV] packed saturating add/sub intrinsics

Add header wrappers for psadd/psaddu/pssub/pssubu (i8x4/i16x2/u8x4/u16x2
and the 64-bit i8x8/i16x4/i32x2/u8x8/u16x4/u32x2), mapping to
__builtin_elementwise_add_sat / sub_sat. Backend instructions, isel
patterns and rvp-simd-*.ll coverage already exist; this wires up the
intrinsic spec section and the cross-project mnemonic test.
---
 clang/lib/Headers/riscv_packed_simd.h         |  68 ++-
 clang/test/CodeGen/RISCV/rvp-intrinsics.c     | 444 ++++++++++++++++++
 .../riscv_packed_simd.c                       | 132 ++++++
 3 files changed, 622 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Headers/riscv_packed_simd.h b/clang/lib/Headers/riscv_packed_simd.h
index a25fb8a696f1d..0fc1de13c17cc 100644
--- a/clang/lib/Headers/riscv_packed_simd.h
+++ b/clang/lib/Headers/riscv_packed_simd.h
@@ -67,7 +67,7 @@ typedef uint32_t uint32x2_t __attribute__((__vector_size__(8)));
     return op __rs1;                                                           \
   }
 
-#define __packed_minmax(name, ty, builtin)                                     \
+#define __packed_binary_builtin(name, ty, builtin)                             \
   static __inline__ ty __DEFAULT_FN_ATTRS                                      \
   __riscv_##name(ty __rs1, ty __rs2) {                                         \
     return builtin(__rs1, __rs2);                                              \
@@ -160,29 +160,53 @@ __packed_unary_op(pneg_i8x8, int8x8_t, -)
 __packed_unary_op(pneg_i16x4, int16x4_t, -)
 __packed_unary_op(pneg_i32x2, int32x2_t, -)
 
+/* Packed Saturating Addition and Subtraction (32-bit) */
+__packed_binary_builtin(psadd_i8x4, int8x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psadd_i16x2, int16x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u8x4, uint8x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u16x2, uint16x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(pssub_i8x4, int8x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssub_i16x2, int16x2_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u8x4, uint8x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u16x2, uint16x2_t, __builtin_elementwise_sub_sat)
+
+/* Packed Saturating Addition and Subtraction (64-bit) */
+__packed_binary_builtin(psadd_i8x8, int8x8_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psadd_i16x4, int16x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psadd_i32x2, int32x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u8x8, uint8x8_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u16x4, uint16x4_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(psaddu_u32x2, uint32x2_t, __builtin_elementwise_add_sat)
+__packed_binary_builtin(pssub_i8x8, int8x8_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssub_i16x4, int16x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssub_i32x2, int32x2_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u8x8, uint8x8_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u16x4, uint16x4_t, __builtin_elementwise_sub_sat)
+__packed_binary_builtin(pssubu_u32x2, uint32x2_t, __builtin_elementwise_sub_sat)
+
 /* Packed Minimum and Maximum (32-bit) */
-__packed_minmax(pmin_i8x4, int8x4_t, __builtin_elementwise_min)
-__packed_minmax(pmin_i16x2, int16x2_t, __builtin_elementwise_min)
-__packed_minmax(pminu_u8x4, uint8x4_t, __builtin_elementwise_min)
-__packed_minmax(pminu_u16x2, uint16x2_t, __builtin_elementwise_min)
-__packed_minmax(pmax_i8x4, int8x4_t, __builtin_elementwise_max)
-__packed_minmax(pmax_i16x2, int16x2_t, __builtin_elementwise_max)
-__packed_minmax(pmaxu_u8x4, uint8x4_t, __builtin_elementwise_max)
-__packed_minmax(pmaxu_u16x2, uint16x2_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmin_i8x4, int8x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmin_i16x2, int16x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u8x4, uint8x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u16x2, uint16x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmax_i8x4, int8x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmax_i16x2, int16x2_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u8x4, uint8x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u16x2, uint16x2_t, __builtin_elementwise_max)
 
 /* Packed Minimum and Maximum (64-bit) */
-__packed_minmax(pmin_i8x8, int8x8_t, __builtin_elementwise_min)
-__packed_minmax(pmin_i16x4, int16x4_t, __builtin_elementwise_min)
-__packed_minmax(pmin_i32x2, int32x2_t, __builtin_elementwise_min)
-__packed_minmax(pminu_u8x8, uint8x8_t, __builtin_elementwise_min)
-__packed_minmax(pminu_u16x4, uint16x4_t, __builtin_elementwise_min)
-__packed_minmax(pminu_u32x2, uint32x2_t, __builtin_elementwise_min)
-__packed_minmax(pmax_i8x8, int8x8_t, __builtin_elementwise_max)
-__packed_minmax(pmax_i16x4, int16x4_t, __builtin_elementwise_max)
-__packed_minmax(pmax_i32x2, int32x2_t, __builtin_elementwise_max)
-__packed_minmax(pmaxu_u8x8, uint8x8_t, __builtin_elementwise_max)
-__packed_minmax(pmaxu_u16x4, uint16x4_t, __builtin_elementwise_max)
-__packed_minmax(pmaxu_u32x2, uint32x2_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmin_i8x8, int8x8_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmin_i16x4, int16x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmin_i32x2, int32x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u8x8, uint8x8_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u16x4, uint16x4_t, __builtin_elementwise_min)
+__packed_binary_builtin(pminu_u32x2, uint32x2_t, __builtin_elementwise_min)
+__packed_binary_builtin(pmax_i8x8, int8x8_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmax_i16x4, int16x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmax_i32x2, int32x2_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u8x8, uint8x8_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u16x4, uint16x4_t, __builtin_elementwise_max)
+__packed_binary_builtin(pmaxu_u32x2, uint32x2_t, __builtin_elementwise_max)
 
 /* Packed Logical Operations (32-bit) */
 __packed_binary_op(pand_i8x4, int8x4_t, &)
@@ -239,7 +263,7 @@ __packed_unary_op(pnot_u32x2, uint32x2_t, ~)
 #undef __packed_scalar_binary_op
 #undef __packed_binary_op
 #undef __packed_unary_op
-#undef __packed_minmax
+#undef __packed_binary_builtin
 #undef __DEFAULT_FN_ATTRS
 
 #if defined(__cplusplus)
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index c84eb6ac2e270..4f64b7dd34c55 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -1556,6 +1556,450 @@ int32x2_t test_pneg_i32x2(int32x2_t a) {
   return __riscv_pneg_i32x2(a);
 }
 
+/* Packed Saturating Addition and Subtraction (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_psadd_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psadd_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int8x4_t test_psadd_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psadd_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psadd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psadd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_psadd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psadd_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psaddu_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psaddu_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint8x4_t test_psaddu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psaddu_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psaddu_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psaddu_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_psaddu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psaddu_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssub_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssub_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int8x4_t test_pssub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pssub_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssub_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssub_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_pssub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pssub_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssubu_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssubu_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint8x4_t test_pssubu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pssubu_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssubu_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssubu_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_pssubu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pssubu_u16x2(a, b);
+}
+
+/* Packed Saturating Addition and Subtraction (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_psadd_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psadd_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_psadd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psadd_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psadd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psadd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_psadd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psadd_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psadd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psadd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_psadd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psadd_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psaddu_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psaddu_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_psaddu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psaddu_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psaddu_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psaddu_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_psaddu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psaddu_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psaddu_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psaddu_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_psaddu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psaddu_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssub_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssub_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_pssub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pssub_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssub_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssub_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_pssub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pssub_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssub_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssub_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_pssub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pssub_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssubu_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssubu_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_pssubu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pssubu_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssubu_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssubu_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_pssubu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pssubu_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssubu_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssubu_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_pssubu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pssubu_u32x2(a, b);
+}
+
 /* Packed Minimum and Maximum (32-bit) */
 
 // RV32-LABEL: define dso_local i32 @test_pmin_i8x4(
diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
index f5a31a900403a..98de0ffa650b7 100644
--- a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
+++ b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
@@ -629,6 +629,138 @@ int16x4_t test_pneg_i16x4(int16x4_t a) { return __riscv_pneg_i16x4(a); }
 // RV64:        pneg.w
 int32x2_t test_pneg_i32x2(int32x2_t a) { return __riscv_pneg_i32x2(a); }
 
+// CHECK-LABEL: test_psadd_i8x4:
+// CHECK:       psadd.b
+int8x4_t test_psadd_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psadd_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i16x2:
+// CHECK:       psadd.h
+int16x2_t test_psadd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psadd_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u8x4:
+// CHECK:       psaddu.b
+uint8x4_t test_psaddu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psaddu_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u16x2:
+// CHECK:       psaddu.h
+uint16x2_t test_psaddu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psaddu_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i8x4:
+// CHECK:       pssub.b
+int8x4_t test_pssub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pssub_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i16x2:
+// CHECK:       pssub.h
+int16x2_t test_pssub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pssub_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u8x4:
+// CHECK:       pssubu.b
+uint8x4_t test_pssubu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pssubu_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u16x2:
+// CHECK:       pssubu.h
+uint16x2_t test_pssubu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pssubu_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i8x8:
+// RV32:        psadd.db
+// RV64:        psadd.b
+int8x8_t test_psadd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psadd_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i16x4:
+// RV32:        psadd.dh
+// RV64:        psadd.h
+int16x4_t test_psadd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psadd_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_psadd_i32x2:
+// RV32:        psadd.dw
+// RV64:        psadd.w
+int32x2_t test_psadd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psadd_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u8x8:
+// RV32:        psaddu.db
+// RV64:        psaddu.b
+uint8x8_t test_psaddu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psaddu_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u16x4:
+// RV32:        psaddu.dh
+// RV64:        psaddu.h
+uint16x4_t test_psaddu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psaddu_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_psaddu_u32x2:
+// RV32:        psaddu.dw
+// RV64:        psaddu.w
+uint32x2_t test_psaddu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psaddu_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i8x8:
+// RV32:        pssub.db
+// RV64:        pssub.b
+int8x8_t test_pssub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pssub_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i16x4:
+// RV32:        pssub.dh
+// RV64:        pssub.h
+int16x4_t test_pssub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pssub_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pssub_i32x2:
+// RV32:        pssub.dw
+// RV64:        pssub.w
+int32x2_t test_pssub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pssub_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u8x8:
+// RV32:        pssubu.db
+// RV64:        pssubu.b
+uint8x8_t test_pssubu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pssubu_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u16x4:
+// RV32:        pssubu.dh
+// RV64:        pssubu.h
+uint16x4_t test_pssubu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pssubu_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_pssubu_u32x2:
+// RV32:        pssubu.dw
+// RV64:        pssubu.w
+uint32x2_t test_pssubu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pssubu_u32x2(a, b);
+}
+
 // CHECK-LABEL: test_pmin_i8x4:
 // CHECK:       pmin.b
 int8x4_t test_pmin_i8x4(int8x4_t a, int8x4_t b) {

>From ff6d88e6c140d9dcca60dd93c09480859a41d5ca Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Wed, 3 Jun 2026 15:45:59 +0800
Subject: [PATCH 16/19] [Clang][RISCV] packed shift-add intrinsics

Add psh1add / pssh1sadd header wrappers, the CodeGen IR checks in
rvp-intrinsics.c, and the cross-project mnemonic test.
---
 clang/lib/Headers/riscv_packed_simd.h         |  32 +++
 clang/test/CodeGen/RISCV/rvp-intrinsics.c     | 220 ++++++++++++++++++
 .../riscv_packed_simd.c                       |  60 +++++
 3 files changed, 312 insertions(+)

diff --git a/clang/lib/Headers/riscv_packed_simd.h b/clang/lib/Headers/riscv_packed_simd.h
index 0fc1de13c17cc..e7c0e31daffce 100644
--- a/clang/lib/Headers/riscv_packed_simd.h
+++ b/clang/lib/Headers/riscv_packed_simd.h
@@ -73,6 +73,23 @@ typedef uint32_t uint32x2_t __attribute__((__vector_size__(8)));
     return builtin(__rs1, __rs2);                                              \
   }
 
+#define __packed_sh1add(name, ty)                                              \
+  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
+  __riscv_##name(ty __rs1, ty __rs2) {                                         \
+    return (__rs1 << 1) + __rs2;                                               \
+  }
+
+/* TODO: switch to sadd_sat(__builtin_elementwise_shl_sat(a, 1), b) once a
+ * generic elementwise shl_sat builtin exists. Until then, sadd_sat(a, a) is
+ * used: it equals shl_sat(a, 1) for signed types, and the backend's
+ * saturating_shl1 PatFrags matches both shapes. */
+#define __packed_sh1sadd(name, ty)                                             \
+  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
+  __riscv_##name(ty __rs1, ty __rs2) {                                         \
+    return __builtin_elementwise_add_sat(                                      \
+        __builtin_elementwise_add_sat(__rs1, __rs1), __rs2);                   \
+  }
+
 /* Packed Splat (32-bit) */
 __packed_splat(pmv_s_u8x4, uint8x4_t, uint8_t, __packed_splat4)
 __packed_splat(pmv_s_i8x4, int8x4_t, int8_t, __packed_splat4)
@@ -184,6 +201,19 @@ __packed_binary_builtin(pssubu_u8x8, uint8x8_t, __builtin_elementwise_sub_sat)
 __packed_binary_builtin(pssubu_u16x4, uint16x4_t, __builtin_elementwise_sub_sat)
 __packed_binary_builtin(pssubu_u32x2, uint32x2_t, __builtin_elementwise_sub_sat)
 
+/* Packed Shift-Add (32-bit) */
+__packed_sh1add(psh1add_i16x2, int16x2_t)
+__packed_sh1add(psh1add_u16x2, uint16x2_t)
+__packed_sh1sadd(pssh1sadd_i16x2, int16x2_t)
+
+/* Packed Shift-Add (64-bit) */
+__packed_sh1add(psh1add_i16x4, int16x4_t)
+__packed_sh1add(psh1add_u16x4, uint16x4_t)
+__packed_sh1add(psh1add_i32x2, int32x2_t)
+__packed_sh1add(psh1add_u32x2, uint32x2_t)
+__packed_sh1sadd(pssh1sadd_i16x4, int16x4_t)
+__packed_sh1sadd(pssh1sadd_i32x2, int32x2_t)
+
 /* Packed Minimum and Maximum (32-bit) */
 __packed_binary_builtin(pmin_i8x4, int8x4_t, __builtin_elementwise_min)
 __packed_binary_builtin(pmin_i16x2, int16x2_t, __builtin_elementwise_min)
@@ -264,6 +294,8 @@ __packed_unary_op(pnot_u32x2, uint32x2_t, ~)
 #undef __packed_binary_op
 #undef __packed_unary_op
 #undef __packed_binary_builtin
+#undef __packed_sh1add
+#undef __packed_sh1sadd
 #undef __DEFAULT_FN_ATTRS
 
 #if defined(__cplusplus)
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index 4f64b7dd34c55..b7bd0458ca297 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -2000,6 +2000,226 @@ uint32x2_t test_pssubu_u32x2(uint32x2_t a, uint32x2_t b) {
   return __riscv_pssubu_u32x2(a, b);
 }
 
+/* Packed Shift-Add (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_psh1add_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psh1add_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_psh1add_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psh1add_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psh1add_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psh1add_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_psh1add_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psh1add_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssh1sadd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP0]])
+// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ELT_SAT_I]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT3_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssh1sadd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP0]])
+// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ELT_SAT_I]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT3_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_pssh1sadd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pssh1sadd_i16x2(a, b);
+}
+
+/* Packed Shift-Add (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_psh1add_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psh1add_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_psh1add_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psh1add_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_psh1add_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psh1add_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_psh1add_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psh1add_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssh1sadd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP0]])
+// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[ELT_SAT_I]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT3_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssh1sadd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP0]])
+// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[ELT_SAT_I]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT3_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_pssh1sadd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pssh1sadd_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pssh1sadd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP0]])
+// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ELT_SAT_I]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT3_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_pssh1sadd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP0]])
+// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ELT_SAT_I]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT3_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_pssh1sadd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pssh1sadd_i32x2(a, b);
+}
+
 /* Packed Minimum and Maximum (32-bit) */
 
 // RV32-LABEL: define dso_local i32 @test_pmin_i8x4(
diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
index 98de0ffa650b7..6a01dcfa35219 100644
--- a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
+++ b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
@@ -761,6 +761,66 @@ uint32x2_t test_pssubu_u32x2(uint32x2_t a, uint32x2_t b) {
   return __riscv_pssubu_u32x2(a, b);
 }
 
+// CHECK-LABEL: test_psh1add_i16x2:
+// CHECK:       psh1add.h
+int16x2_t test_psh1add_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psh1add_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_psh1add_u16x2:
+// CHECK:       psh1add.h
+uint16x2_t test_psh1add_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psh1add_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_pssh1sadd_i16x2:
+// CHECK:       pssh1sadd.h
+int16x2_t test_pssh1sadd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pssh1sadd_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_psh1add_i16x4:
+// RV32:        psh1add.dh
+// RV64:        psh1add.h
+int16x4_t test_psh1add_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psh1add_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_psh1add_u16x4:
+// RV32:        psh1add.dh
+// RV64:        psh1add.h
+uint16x4_t test_psh1add_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psh1add_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_psh1add_i32x2:
+// RV32:        psh1add.dw
+// RV64:        psh1add.w
+int32x2_t test_psh1add_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psh1add_i32x2(a, b);
+}
+
+// CHECK-LABEL: test_psh1add_u32x2:
+// RV32:        psh1add.dw
+// RV64:        psh1add.w
+uint32x2_t test_psh1add_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psh1add_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_pssh1sadd_i16x4:
+// RV32:        pssh1sadd.dh
+// RV64:        pssh1sadd.h
+int16x4_t test_pssh1sadd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pssh1sadd_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_pssh1sadd_i32x2:
+// RV32:        pssh1sadd.dw
+// RV64:        pssh1sadd.w
+int32x2_t test_pssh1sadd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pssh1sadd_i32x2(a, b);
+}
+
 // CHECK-LABEL: test_pmin_i8x4:
 // CHECK:       pmin.b
 int8x4_t test_pmin_i8x4(int8x4_t a, int8x4_t b) {

>From 8e0291e1b199f1c80bd583155c971ab9a3dd3425 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Tue, 9 Jun 2026 08:20:31 +0800
Subject: [PATCH 17/19] reorder sections to match latest spec

---
 clang/lib/Headers/riscv_packed_simd.h         |   88 +-
 clang/test/CodeGen/RISCV/rvp-intrinsics.c     | 2778 ++++++++---------
 .../riscv_packed_simd.c                       |  644 ++--
 3 files changed, 1755 insertions(+), 1755 deletions(-)

diff --git a/clang/lib/Headers/riscv_packed_simd.h b/clang/lib/Headers/riscv_packed_simd.h
index e7c0e31daffce..1f4f33c5fafa1 100644
--- a/clang/lib/Headers/riscv_packed_simd.h
+++ b/clang/lib/Headers/riscv_packed_simd.h
@@ -104,50 +104,6 @@ __packed_splat(pmv_s_i16x4, int16x4_t, int16_t, __packed_splat4)
 __packed_splat(pmv_s_u32x2, uint32x2_t, uint32_t, __packed_splat2)
 __packed_splat(pmv_s_i32x2, int32x2_t, int32_t, __packed_splat2)
 
-/* Packed Shifts (32-bit) */
-__packed_shift8(psll_s_u8x4, uint8x4_t, <<)
-__packed_shift8(psll_s_i8x4, int8x4_t, <<)
-__packed_shift16(psll_s_u16x2, uint16x2_t, <<)
-__packed_shift16(psll_s_i16x2, int16x2_t, <<)
-__packed_shift8(psrl_s_u8x4, uint8x4_t, >>)
-__packed_shift16(psrl_s_u16x2, uint16x2_t, >>)
-__packed_shift8(psra_s_i8x4, int8x4_t, >>)
-__packed_shift16(psra_s_i16x2, int16x2_t, >>)
-
-/* Packed Shifts (64-bit) */
-__packed_shift8(psll_s_u8x8, uint8x8_t, <<)
-__packed_shift8(psll_s_i8x8, int8x8_t, <<)
-__packed_shift16(psll_s_u16x4, uint16x4_t, <<)
-__packed_shift16(psll_s_i16x4, int16x4_t, <<)
-__packed_shift32(psll_s_u32x2, uint32x2_t, <<)
-__packed_shift32(psll_s_i32x2, int32x2_t, <<)
-__packed_shift8(psrl_s_u8x8, uint8x8_t, >>)
-__packed_shift16(psrl_s_u16x4, uint16x4_t, >>)
-__packed_shift32(psrl_s_u32x2, uint32x2_t, >>)
-__packed_shift8(psra_s_i8x8, int8x8_t, >>)
-__packed_shift16(psra_s_i16x4, int16x4_t, >>)
-__packed_shift32(psra_s_i32x2, int32x2_t, >>)
-
-/* Packed Addition with Scalar (32-bit) */
-__packed_scalar_binary_op(padd_s_u8x4, uint8x4_t, uint8_t, +, __packed_splat4)
-__packed_scalar_binary_op(padd_s_i8x4, int8x4_t, int8_t, +, __packed_splat4)
-__packed_scalar_binary_op(padd_s_u16x2, uint16x2_t, uint16_t, +,
-                          __packed_splat2)
-__packed_scalar_binary_op(padd_s_i16x2, int16x2_t, int16_t, +,
-                          __packed_splat2)
-
-/* Packed Addition with Scalar (64-bit) */
-__packed_scalar_binary_op(padd_s_u8x8, uint8x8_t, uint8_t, +, __packed_splat8)
-__packed_scalar_binary_op(padd_s_i8x8, int8x8_t, int8_t, +, __packed_splat8)
-__packed_scalar_binary_op(padd_s_u16x4, uint16x4_t, uint16_t, +,
-                          __packed_splat4)
-__packed_scalar_binary_op(padd_s_i16x4, int16x4_t, int16_t, +,
-                          __packed_splat4)
-__packed_scalar_binary_op(padd_s_u32x2, uint32x2_t, uint32_t, +,
-                          __packed_splat2)
-__packed_scalar_binary_op(padd_s_i32x2, int32x2_t, int32_t, +,
-                          __packed_splat2)
-
 /* Packed Addition and Subtraction (32-bit) */
 __packed_binary_op(padd_i8x4, int8x4_t, +)
 __packed_binary_op(padd_u8x4, uint8x4_t, +)
@@ -177,6 +133,26 @@ __packed_unary_op(pneg_i8x8, int8x8_t, -)
 __packed_unary_op(pneg_i16x4, int16x4_t, -)
 __packed_unary_op(pneg_i32x2, int32x2_t, -)
 
+/* Packed Addition with Scalar (32-bit) */
+__packed_scalar_binary_op(padd_s_u8x4, uint8x4_t, uint8_t, +, __packed_splat4)
+__packed_scalar_binary_op(padd_s_i8x4, int8x4_t, int8_t, +, __packed_splat4)
+__packed_scalar_binary_op(padd_s_u16x2, uint16x2_t, uint16_t, +,
+                          __packed_splat2)
+__packed_scalar_binary_op(padd_s_i16x2, int16x2_t, int16_t, +,
+                          __packed_splat2)
+
+/* Packed Addition with Scalar (64-bit) */
+__packed_scalar_binary_op(padd_s_u8x8, uint8x8_t, uint8_t, +, __packed_splat8)
+__packed_scalar_binary_op(padd_s_i8x8, int8x8_t, int8_t, +, __packed_splat8)
+__packed_scalar_binary_op(padd_s_u16x4, uint16x4_t, uint16_t, +,
+                          __packed_splat4)
+__packed_scalar_binary_op(padd_s_i16x4, int16x4_t, int16_t, +,
+                          __packed_splat4)
+__packed_scalar_binary_op(padd_s_u32x2, uint32x2_t, uint32_t, +,
+                          __packed_splat2)
+__packed_scalar_binary_op(padd_s_i32x2, int32x2_t, int32_t, +,
+                          __packed_splat2)
+
 /* Packed Saturating Addition and Subtraction (32-bit) */
 __packed_binary_builtin(psadd_i8x4, int8x4_t, __builtin_elementwise_add_sat)
 __packed_binary_builtin(psadd_i16x2, int16x2_t, __builtin_elementwise_add_sat)
@@ -238,6 +214,30 @@ __packed_binary_builtin(pmaxu_u8x8, uint8x8_t, __builtin_elementwise_max)
 __packed_binary_builtin(pmaxu_u16x4, uint16x4_t, __builtin_elementwise_max)
 __packed_binary_builtin(pmaxu_u32x2, uint32x2_t, __builtin_elementwise_max)
 
+/* Packed Shifts (32-bit) */
+__packed_shift8(psll_s_u8x4, uint8x4_t, <<)
+__packed_shift8(psll_s_i8x4, int8x4_t, <<)
+__packed_shift16(psll_s_u16x2, uint16x2_t, <<)
+__packed_shift16(psll_s_i16x2, int16x2_t, <<)
+__packed_shift8(psrl_s_u8x4, uint8x4_t, >>)
+__packed_shift16(psrl_s_u16x2, uint16x2_t, >>)
+__packed_shift8(psra_s_i8x4, int8x4_t, >>)
+__packed_shift16(psra_s_i16x2, int16x2_t, >>)
+
+/* Packed Shifts (64-bit) */
+__packed_shift8(psll_s_u8x8, uint8x8_t, <<)
+__packed_shift8(psll_s_i8x8, int8x8_t, <<)
+__packed_shift16(psll_s_u16x4, uint16x4_t, <<)
+__packed_shift16(psll_s_i16x4, int16x4_t, <<)
+__packed_shift32(psll_s_u32x2, uint32x2_t, <<)
+__packed_shift32(psll_s_i32x2, int32x2_t, <<)
+__packed_shift8(psrl_s_u8x8, uint8x8_t, >>)
+__packed_shift16(psrl_s_u16x4, uint16x4_t, >>)
+__packed_shift32(psrl_s_u32x2, uint32x2_t, >>)
+__packed_shift8(psra_s_i8x8, int8x8_t, >>)
+__packed_shift16(psra_s_i16x4, int16x4_t, >>)
+__packed_shift32(psra_s_i32x2, int32x2_t, >>)
+
 /* Packed Logical Operations (32-bit) */
 __packed_binary_op(pand_i8x4, int8x4_t, &)
 __packed_binary_op(pand_u8x4, uint8x4_t, &)
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index b7bd0458ca297..73db0bee19def 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -212,600 +212,588 @@ int32x2_t test_pmv_s_i32x2(int32_t x) {
   return __riscv_pmv_s_i32x2(x);
 }
 
-/* Packed Shifts (32-bit) */
+/* Packed Addition and Subtraction (32-bit) */
 
-// RV32-LABEL: define dso_local i32 @test_psll_s_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psll_s_i8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
-int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
-  return __riscv_psll_s_i8x4(a, shamt);
+int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_padd_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psll_s_u8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psll_s_u8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
-  return __riscv_psll_s_u8x4(a, shamt);
+uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_padd_u8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psll_s_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psll_s_i16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
-int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
-  return __riscv_psll_s_i16x2(a, shamt);
+int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_padd_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psll_s_u16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psll_s_u16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
-  return __riscv_psll_s_u16x2(a, shamt);
+uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_padd_u16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psra_s_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psra_s_i8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
-int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
-  return __riscv_psra_s_i8x4(a, shamt);
+int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psub_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psrl_s_u8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psrl_s_u8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
-  return __riscv_psrl_s_u8x4(a, shamt);
+uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psub_u8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psra_s_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psra_s_i16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
-int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
-  return __riscv_psra_s_i16x2(a, shamt);
+int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psub_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psrl_s_u16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psrl_s_u16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
-  return __riscv_psrl_s_u16x2(a, shamt);
+uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psub_u16x2(a, b);
 }
 
-/* Packed Shifts (64-bit) */
+// RV32-LABEL: define dso_local i32 @test_pneg_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pneg_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int8x4_t test_pneg_i8x4(int8x4_t a) {
+  return __riscv_pneg_i8x4(a);
+}
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_i8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_pneg_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_pneg_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int16x2_t test_pneg_i16x2(int16x2_t a) {
+  return __riscv_pneg_i16x2(a);
+}
+
+/* Packed Addition and Subtraction (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
-  return __riscv_psll_s_i8x8(a, shamt);
+int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_padd_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_u8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
-  return __riscv_psll_s_u8x8(a, shamt);
+uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_padd_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
-  return __riscv_psll_s_i16x4(a, shamt);
+int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_padd_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_u16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
-  return __riscv_psll_s_u16x4(a, shamt);
+uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_padd_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
-  return __riscv_psll_s_i32x2(a, shamt);
+int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_padd_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_u32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) {
-  return __riscv_psll_s_u32x2(a, shamt);
+uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_padd_u32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psra_s_i8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
-  return __riscv_psra_s_i8x8(a, shamt);
+int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psub_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psrl_s_u8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
-  return __riscv_psrl_s_u8x8(a, shamt);
+uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psub_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psra_s_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
-  return __riscv_psra_s_i16x4(a, shamt);
+int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psub_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psrl_s_u16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP4]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP4]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
-  return __riscv_psrl_s_u16x4(a, shamt);
+uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psub_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psra_s_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
-  return __riscv_psra_s_i32x2(a, shamt);
+int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psub_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psrl_s_u32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
-// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned shamt) {
-  return __riscv_psrl_s_u32x2(a, shamt);
+uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psub_u32x2(a, b);
 }
 
-/* Packed Addition with Scalar (32-bit) */
-
-// RV32-LABEL: define dso_local i32 @test_padd_s_u8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pneg_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
-// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP1]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i32 @test_padd_s_u8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pneg_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
-// RV64-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP1]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
-uint8x4_t test_padd_s_u8x4(uint8x4_t a, uint8_t b) {
-  return __riscv_padd_s_u8x4(a, b);
+int8x8_t test_pneg_i8x8(int8x8_t a) {
+  return __riscv_pneg_i8x8(a);
 }
 
-// RV32-LABEL: define dso_local i32 @test_padd_s_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pneg_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
-// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP1]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i32 @test_padd_s_i8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pneg_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int16x4_t test_pneg_i16x4(int16x4_t a) {
+  return __riscv_pneg_i16x4(a);
+}
+
+// RV32-LABEL: define dso_local i64 @test_pneg_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_pneg_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_pneg_i32x2(int32x2_t a) {
+  return __riscv_pneg_i32x2(a);
+}
+
+/* Packed Addition with Scalar (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV64-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint8x4_t test_padd_s_u8x4(uint8x4_t a, uint8_t b) {
+  return __riscv_padd_s_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+// RV32-NEXT:    [[VECINIT4_I:%.*]] = shufflevector <4 x i8> [[VECINIT_I]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[VECINIT4_I]], [[TMP0]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i32 @test_padd_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i8 noundef signext [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
@@ -1012,1656 +1000,1668 @@ int32x2_t test_padd_s_i32x2(int32x2_t a, int32_t b) {
   return __riscv_padd_s_i32x2(a, b);
 }
 
-/* Packed Addition and Subtraction (32-bit) */
+/* Packed Saturating Addition and Subtraction (32-bit) */
 
-// RV32-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV32-LABEL: define dso_local i32 @test_psadd_i8x4(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV64-LABEL: define dso_local i32 @test_psadd_i8x4(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
-  return __riscv_padd_i8x4(a, b);
+int8x4_t test_psadd_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psadd_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV32-LABEL: define dso_local i32 @test_psadd_i16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV64-LABEL: define dso_local i32 @test_psadd_i16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
-  return __riscv_padd_u8x4(a, b);
+int16x2_t test_psadd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psadd_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV32-LABEL: define dso_local i32 @test_psaddu_u8x4(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV64-LABEL: define dso_local i32 @test_psaddu_u8x4(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
-  return __riscv_padd_i16x2(a, b);
+uint8x4_t test_psaddu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psaddu_u8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV32-LABEL: define dso_local i32 @test_psaddu_u16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV64-LABEL: define dso_local i32 @test_psaddu_u16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
-  return __riscv_padd_u16x2(a, b);
+uint16x2_t test_psaddu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psaddu_u16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV32-LABEL: define dso_local i32 @test_pssub_i8x4(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV64-LABEL: define dso_local i32 @test_pssub_i8x4(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
-  return __riscv_psub_i8x4(a, b);
+int8x4_t test_pssub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pssub_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV32-LABEL: define dso_local i32 @test_pssub_i16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV64-LABEL: define dso_local i32 @test_pssub_i16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
-  return __riscv_psub_u8x4(a, b);
+int16x2_t test_pssub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pssub_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV32-LABEL: define dso_local i32 @test_pssubu_u8x4(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV64-LABEL: define dso_local i32 @test_pssubu_u8x4(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
-  return __riscv_psub_i16x2(a, b);
+uint8x4_t test_pssubu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pssubu_u8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV32-LABEL: define dso_local i32 @test_pssubu_u16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV64-LABEL: define dso_local i32 @test_pssubu_u16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
-  return __riscv_psub_u16x2(a, b);
+uint16x2_t test_pssubu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pssubu_u16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_pneg_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+/* Packed Saturating Addition and Subtraction (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_psadd_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> zeroinitializer, [[TMP0]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP1]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_pneg_i8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psadd_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> zeroinitializer, [[TMP0]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP1]]
-//
-int8x4_t test_pneg_i8x4(int8x4_t a) {
-  return __riscv_pneg_i8x4(a);
-}
-
-// RV32-LABEL: define dso_local i32 @test_pneg_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> zeroinitializer, [[TMP0]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP1]]
-//
-// RV64-LABEL: define dso_local i32 @test_pneg_i16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> zeroinitializer, [[TMP0]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP1]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-int16x2_t test_pneg_i16x2(int16x2_t a) {
-  return __riscv_pneg_i16x2(a);
+int8x8_t test_psadd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psadd_i8x8(a, b);
 }
 
-/* Packed Addition and Subtraction (64-bit) */
-
-// RV32-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV32-LABEL: define dso_local i64 @test_psadd_i16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV64-LABEL: define dso_local i64 @test_psadd_i16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
-  return __riscv_padd_i8x8(a, b);
+int16x4_t test_psadd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psadd_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV32-LABEL: define dso_local i64 @test_psadd_i32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV64-LABEL: define dso_local i64 @test_psadd_i32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
-  return __riscv_padd_u8x8(a, b);
+int32x2_t test_psadd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psadd_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV32-LABEL: define dso_local i64 @test_psaddu_u8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV64-LABEL: define dso_local i64 @test_psaddu_u8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
-  return __riscv_padd_i16x4(a, b);
+uint8x8_t test_psaddu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psaddu_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV32-LABEL: define dso_local i64 @test_psaddu_u16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV64-LABEL: define dso_local i64 @test_psaddu_u16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
-  return __riscv_padd_u16x4(a, b);
+uint16x4_t test_psaddu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psaddu_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV32-LABEL: define dso_local i64 @test_psaddu_u32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV64-LABEL: define dso_local i64 @test_psaddu_u32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
-  return __riscv_padd_i32x2(a, b);
+uint32x2_t test_psaddu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psaddu_u32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV32-LABEL: define dso_local i64 @test_pssub_i8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV64-LABEL: define dso_local i64 @test_pssub_i8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
-  return __riscv_padd_u32x2(a, b);
+int8x8_t test_pssub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pssub_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV32-LABEL: define dso_local i64 @test_pssub_i16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV64-LABEL: define dso_local i64 @test_pssub_i16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
-  return __riscv_psub_i8x8(a, b);
+int16x4_t test_pssub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pssub_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV32-LABEL: define dso_local i64 @test_pssub_i32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV64-LABEL: define dso_local i64 @test_pssub_i32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
-  return __riscv_psub_u8x8(a, b);
+int32x2_t test_pssub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pssub_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV32-LABEL: define dso_local i64 @test_pssubu_u8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV64-LABEL: define dso_local i64 @test_pssubu_u8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
-  return __riscv_psub_i16x4(a, b);
+uint8x8_t test_pssubu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pssubu_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV32-LABEL: define dso_local i64 @test_pssubu_u16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV64-LABEL: define dso_local i64 @test_pssubu_u16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
-  return __riscv_psub_u16x4(a, b);
+uint16x4_t test_pssubu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pssubu_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV32-LABEL: define dso_local i64 @test_pssubu_u32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV64-LABEL: define dso_local i64 @test_pssubu_u32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
-  return __riscv_psub_i32x2(a, b);
+uint32x2_t test_pssubu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pssubu_u32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_u32x2(
+/* Packed Shift-Add (32-bit) */
+
+// RV32-LABEL: define dso_local i32 @test_psh1add_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psh1add_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_psh1add_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psh1add_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psh1add_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_psh1add_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+uint16x2_t test_psh1add_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psh1add_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_pssh1sadd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP0]])
+// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ELT_SAT_I]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT3_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i32 @test_pssh1sadd_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP0]])
+// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ELT_SAT_I]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT3_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP2]]
+//
+int16x2_t test_pssh1sadd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pssh1sadd_i16x2(a, b);
+}
+
+/* Packed Shift-Add (64-bit) */
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_psh1add_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psh1add_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psh1add_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_psh1add_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psh1add_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psh1add_i32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV64-LABEL: define dso_local i64 @test_psh1add_i32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
-  return __riscv_psub_u32x2(a, b);
+int32x2_t test_psh1add_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psh1add_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pneg_i8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psh1add_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[TMP0]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_pneg_i8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psh1add_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[TMP0]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-int8x8_t test_pneg_i8x8(int8x8_t a) {
-  return __riscv_pneg_i8x8(a);
+uint32x2_t test_psh1add_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psh1add_u32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pneg_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pssh1sadd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[TMP0]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP0]])
+// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[ELT_SAT_I]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT3_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_pneg_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pssh1sadd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[TMP0]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP0]])
+// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[ELT_SAT_I]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT3_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-int16x4_t test_pneg_i16x4(int16x4_t a) {
-  return __riscv_pneg_i16x4(a);
+int16x4_t test_pssh1sadd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pssh1sadd_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pneg_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_pssh1sadd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[TMP0]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP0]])
+// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ELT_SAT_I]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT3_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_pneg_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_pssh1sadd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[TMP0]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP0]])
+// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ELT_SAT_I]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT3_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
-int32x2_t test_pneg_i32x2(int32x2_t a) {
-  return __riscv_pneg_i32x2(a);
+int32x2_t test_pssh1sadd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pssh1sadd_i32x2(a, b);
 }
 
-/* Packed Saturating Addition and Subtraction (32-bit) */
+/* Packed Minimum and Maximum (32-bit) */
 
-// RV32-LABEL: define dso_local i32 @test_psadd_i8x4(
+// RV32-LABEL: define dso_local i32 @test_pmin_i8x4(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psadd_i8x4(
+// RV64-LABEL: define dso_local i32 @test_pmin_i8x4(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-int8x4_t test_psadd_i8x4(int8x4_t a, int8x4_t b) {
-  return __riscv_psadd_i8x4(a, b);
+int8x4_t test_pmin_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pmin_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psadd_i16x2(
+// RV32-LABEL: define dso_local i32 @test_pmin_i16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psadd_i16x2(
+// RV64-LABEL: define dso_local i32 @test_pmin_i16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-int16x2_t test_psadd_i16x2(int16x2_t a, int16x2_t b) {
-  return __riscv_psadd_i16x2(a, b);
+int16x2_t test_pmin_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pmin_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psaddu_u8x4(
+// RV32-LABEL: define dso_local i32 @test_pminu_u8x4(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psaddu_u8x4(
+// RV64-LABEL: define dso_local i32 @test_pminu_u8x4(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint8x4_t test_psaddu_u8x4(uint8x4_t a, uint8x4_t b) {
-  return __riscv_psaddu_u8x4(a, b);
+uint8x4_t test_pminu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pminu_u8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psaddu_u16x2(
+// RV32-LABEL: define dso_local i32 @test_pminu_u16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_psaddu_u16x2(
+// RV64-LABEL: define dso_local i32 @test_pminu_u16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint16x2_t test_psaddu_u16x2(uint16x2_t a, uint16x2_t b) {
-  return __riscv_psaddu_u16x2(a, b);
+uint16x2_t test_pminu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pminu_u16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_pssub_i8x4(
+// RV32-LABEL: define dso_local i32 @test_pmax_i8x4(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_pssub_i8x4(
+// RV64-LABEL: define dso_local i32 @test_pmax_i8x4(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-int8x4_t test_pssub_i8x4(int8x4_t a, int8x4_t b) {
-  return __riscv_pssub_i8x4(a, b);
+int8x4_t test_pmax_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_pmax_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_pssub_i16x2(
+// RV32-LABEL: define dso_local i32 @test_pmax_i16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_pssub_i16x2(
+// RV64-LABEL: define dso_local i32 @test_pmax_i16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-int16x2_t test_pssub_i16x2(int16x2_t a, int16x2_t b) {
-  return __riscv_pssub_i16x2(a, b);
+int16x2_t test_pmax_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_pmax_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_pssubu_u8x4(
+// RV32-LABEL: define dso_local i32 @test_pmaxu_u8x4(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_pssubu_u8x4(
+// RV64-LABEL: define dso_local i32 @test_pmaxu_u8x4(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint8x4_t test_pssubu_u8x4(uint8x4_t a, uint8x4_t b) {
-  return __riscv_pssubu_u8x4(a, b);
+uint8x4_t test_pmaxu_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_pmaxu_u8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_pssubu_u16x2(
+// RV32-LABEL: define dso_local i32 @test_pmaxu_u16x2(
 // RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
 // RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i32 @test_pssubu_u16x2(
+// RV64-LABEL: define dso_local i32 @test_pmaxu_u16x2(
 // RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT_I]] to i32
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
 // RV64-NEXT:    ret i32 [[TMP2]]
 //
-uint16x2_t test_pssubu_u16x2(uint16x2_t a, uint16x2_t b) {
-  return __riscv_pssubu_u16x2(a, b);
+uint16x2_t test_pmaxu_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_pmaxu_u16x2(a, b);
 }
 
-/* Packed Saturating Addition and Subtraction (64-bit) */
+/* Packed Minimum and Maximum (64-bit) */
 
-// RV32-LABEL: define dso_local i64 @test_psadd_i8x8(
+// RV32-LABEL: define dso_local i64 @test_pmin_i8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psadd_i8x8(
+// RV64-LABEL: define dso_local i64 @test_pmin_i8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int8x8_t test_psadd_i8x8(int8x8_t a, int8x8_t b) {
-  return __riscv_psadd_i8x8(a, b);
+int8x8_t test_pmin_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pmin_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psadd_i16x4(
+// RV32-LABEL: define dso_local i64 @test_pmin_i16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psadd_i16x4(
+// RV64-LABEL: define dso_local i64 @test_pmin_i16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int16x4_t test_psadd_i16x4(int16x4_t a, int16x4_t b) {
-  return __riscv_psadd_i16x4(a, b);
+int16x4_t test_pmin_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pmin_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psadd_i32x2(
+// RV32-LABEL: define dso_local i64 @test_pmin_i32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psadd_i32x2(
+// RV64-LABEL: define dso_local i64 @test_pmin_i32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int32x2_t test_psadd_i32x2(int32x2_t a, int32x2_t b) {
-  return __riscv_psadd_i32x2(a, b);
+int32x2_t test_pmin_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pmin_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psaddu_u8x8(
+// RV32-LABEL: define dso_local i64 @test_pminu_u8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psaddu_u8x8(
+// RV64-LABEL: define dso_local i64 @test_pminu_u8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint8x8_t test_psaddu_u8x8(uint8x8_t a, uint8x8_t b) {
-  return __riscv_psaddu_u8x8(a, b);
+uint8x8_t test_pminu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pminu_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psaddu_u16x4(
+// RV32-LABEL: define dso_local i64 @test_pminu_u16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psaddu_u16x4(
+// RV64-LABEL: define dso_local i64 @test_pminu_u16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint16x4_t test_psaddu_u16x4(uint16x4_t a, uint16x4_t b) {
-  return __riscv_psaddu_u16x4(a, b);
+uint16x4_t test_pminu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pminu_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psaddu_u32x2(
+// RV32-LABEL: define dso_local i64 @test_pminu_u32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_psaddu_u32x2(
+// RV64-LABEL: define dso_local i64 @test_pminu_u32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint32x2_t test_psaddu_u32x2(uint32x2_t a, uint32x2_t b) {
-  return __riscv_psaddu_u32x2(a, b);
+uint32x2_t test_pminu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pminu_u32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pssub_i8x8(
+// RV32-LABEL: define dso_local i64 @test_pmax_i8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_pssub_i8x8(
+// RV64-LABEL: define dso_local i64 @test_pmax_i8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int8x8_t test_pssub_i8x8(int8x8_t a, int8x8_t b) {
-  return __riscv_pssub_i8x8(a, b);
+int8x8_t test_pmax_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_pmax_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pssub_i16x4(
+// RV32-LABEL: define dso_local i64 @test_pmax_i16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.smax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_pssub_i16x4(
+// RV64-LABEL: define dso_local i64 @test_pmax_i16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.smax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int16x4_t test_pssub_i16x4(int16x4_t a, int16x4_t b) {
-  return __riscv_pssub_i16x4(a, b);
+int16x4_t test_pmax_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_pmax_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pssub_i32x2(
+// RV32-LABEL: define dso_local i64 @test_pmax_i32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_pssub_i32x2(
+// RV64-LABEL: define dso_local i64 @test_pmax_i32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-int32x2_t test_pssub_i32x2(int32x2_t a, int32x2_t b) {
-  return __riscv_pssub_i32x2(a, b);
+int32x2_t test_pmax_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_pmax_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pssubu_u8x8(
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u8x8(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_pssubu_u8x8(
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint8x8_t test_pssubu_u8x8(uint8x8_t a, uint8x8_t b) {
-  return __riscv_pssubu_u8x8(a, b);
+uint8x8_t test_pmaxu_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_pmaxu_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pssubu_u16x4(
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u16x4(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.umax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_pssubu_u16x4(
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.umax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint16x4_t test_pssubu_u16x4(uint16x4_t a, uint16x4_t b) {
-  return __riscv_pssubu_u16x4(a, b);
+uint16x4_t test_pmaxu_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_pmaxu_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pssubu_u32x2(
+// RV32-LABEL: define dso_local i64 @test_pmaxu_u32x2(
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.umax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
 // RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local i64 @test_pssubu_u32x2(
+// RV64-LABEL: define dso_local i64 @test_pmaxu_u32x2(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT_I]] to i64
+// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.umax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
 // RV64-NEXT:    ret i64 [[TMP2]]
 //
-uint32x2_t test_pssubu_u32x2(uint32x2_t a, uint32x2_t b) {
-  return __riscv_pssubu_u32x2(a, b);
+uint32x2_t test_pmaxu_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_pmaxu_u32x2(a, b);
 }
 
-/* Packed Shift-Add (32-bit) */
+/* Packed Shifts (32-bit) */
 
-// RV32-LABEL: define dso_local i32 @test_psh1add_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i32 @test_psh1add_i16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
-int16x2_t test_psh1add_i16x2(int16x2_t a, int16x2_t b) {
-  return __riscv_psh1add_i16x2(a, b);
+int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
+  return __riscv_psll_s_i8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psh1add_u16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i32 @test_psh1add_u16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], splat (i16 1)
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[SHL_I]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
-uint16x2_t test_psh1add_u16x2(uint16x2_t a, uint16x2_t b) {
-  return __riscv_psh1add_u16x2(a, b);
+uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
+  return __riscv_psll_s_u8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_pssh1sadd_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP0]])
-// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ELT_SAT_I]], <2 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT3_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i32 @test_pssh1sadd_i16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP0]])
-// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ELT_SAT_I]], <2 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_SAT3_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
-int16x2_t test_pssh1sadd_i16x2(int16x2_t a, int16x2_t b) {
-  return __riscv_pssh1sadd_i16x2(a, b);
+int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
+  return __riscv_psll_s_i16x2(a, shamt);
 }
 
-/* Packed Shift-Add (64-bit) */
-
-// RV32-LABEL: define dso_local i64 @test_psh1add_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_psh1add_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
-int16x4_t test_psh1add_i16x4(int16x4_t a, int16x4_t b) {
-  return __riscv_psh1add_i16x4(a, b);
+uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
+  return __riscv_psll_s_u16x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psh1add_u16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_psh1add_u16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], splat (i16 1)
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[SHL_I]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
-uint16x4_t test_psh1add_u16x4(uint16x4_t a, uint16x4_t b) {
-  return __riscv_psh1add_u16x4(a, b);
+int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
+  return __riscv_psra_s_i8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psh1add_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
-//
-// RV64-LABEL: define dso_local i64 @test_psh1add_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
-//
-int32x2_t test_psh1add_i32x2(int32x2_t a, int32x2_t b) {
-  return __riscv_psh1add_i32x2(a, b);
-}
-
-// RV32-LABEL: define dso_local i64 @test_psh1add_u32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
-//
-// RV64-LABEL: define dso_local i64 @test_psh1add_u32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], splat (i32 1)
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[SHL_I]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
-//
-uint32x2_t test_psh1add_u32x2(uint32x2_t a, uint32x2_t b) {
-  return __riscv_psh1add_u32x2(a, b);
-}
-
-// RV32-LABEL: define dso_local i64 @test_pssh1sadd_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP0]])
-// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[ELT_SAT_I]], <4 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT3_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
-//
-// RV64-LABEL: define dso_local i64 @test_pssh1sadd_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP0]])
-// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[ELT_SAT_I]], <4 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_SAT3_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
-//
-int16x4_t test_pssh1sadd_i16x4(int16x4_t a, int16x4_t b) {
-  return __riscv_pssh1sadd_i16x4(a, b);
-}
-
-// RV32-LABEL: define dso_local i64 @test_pssh1sadd_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP0]])
-// RV32-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ELT_SAT_I]], <2 x i32> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT3_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
-//
-// RV64-LABEL: define dso_local i64 @test_pssh1sadd_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ELT_SAT_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP0]])
-// RV64-NEXT:    [[ELT_SAT3_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[ELT_SAT_I]], <2 x i32> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_SAT3_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
-//
-int32x2_t test_pssh1sadd_i32x2(int32x2_t a, int32x2_t b) {
-  return __riscv_pssh1sadd_i32x2(a, b);
-}
-
-/* Packed Minimum and Maximum (32-bit) */
-
-// RV32-LABEL: define dso_local i32 @test_pmin_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
-//
-// RV64-LABEL: define dso_local i32 @test_pmin_i8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
-//
-int8x4_t test_pmin_i8x4(int8x4_t a, int8x4_t b) {
-  return __riscv_pmin_i8x4(a, b);
-}
-
-// RV32-LABEL: define dso_local i32 @test_pmin_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
-//
-// RV64-LABEL: define dso_local i32 @test_pmin_i16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
-//
-int16x2_t test_pmin_i16x2(int16x2_t a, int16x2_t b) {
-  return __riscv_pmin_i16x2(a, b);
-}
-
-// RV32-LABEL: define dso_local i32 @test_pminu_u8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i32 @test_pminu_u8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MIN_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
-uint8x4_t test_pminu_u8x4(uint8x4_t a, uint8x4_t b) {
-  return __riscv_pminu_u8x4(a, b);
+uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
+  return __riscv_psrl_s_u8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_pminu_u16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i32 @test_pminu_u16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MIN_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
-//
-uint16x2_t test_pminu_u16x2(uint16x2_t a, uint16x2_t b) {
-  return __riscv_pminu_u16x2(a, b);
-}
-
-// RV32-LABEL: define dso_local i32 @test_pmax_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
-//
-// RV64-LABEL: define dso_local i32 @test_pmax_i8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
-int8x4_t test_pmax_i8x4(int8x4_t a, int8x4_t b) {
-  return __riscv_pmax_i8x4(a, b);
+int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
+  return __riscv_psra_s_i16x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_pmax_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i32 @test_pmax_i16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    ret i32 [[TMP4]]
 //
-int16x2_t test_pmax_i16x2(int16x2_t a, int16x2_t b) {
-  return __riscv_pmax_i16x2(a, b);
+uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
+  return __riscv_psrl_s_u16x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_pmaxu_u8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
-//
-// RV64-LABEL: define dso_local i32 @test_pmaxu_u8x4(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ELT_MAX_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
-//
-uint8x4_t test_pmaxu_u8x4(uint8x4_t a, uint8x4_t b) {
-  return __riscv_pmaxu_u8x4(a, b);
-}
+/* Packed Shifts (64-bit) */
 
-// RV32-LABEL: define dso_local i32 @test_pmaxu_u16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i32 @test_pmaxu_u16x2(
-// RV64-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ELT_MAX_I]] to i32
-// RV64-NEXT:    ret i32 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
-uint16x2_t test_pmaxu_u16x2(uint16x2_t a, uint16x2_t b) {
-  return __riscv_pmaxu_u16x2(a, b);
+int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
+  return __riscv_psll_s_i8x8(a, shamt);
 }
 
-/* Packed Minimum and Maximum (64-bit) */
-
-// RV32-LABEL: define dso_local i64 @test_pmin_i8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_pmin_i8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
-int8x8_t test_pmin_i8x8(int8x8_t a, int8x8_t b) {
-  return __riscv_pmin_i8x8(a, b);
+uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
+  return __riscv_psll_s_u8x8(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pmin_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_pmin_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
-//
-int16x4_t test_pmin_i16x4(int16x4_t a, int16x4_t b) {
-  return __riscv_pmin_i16x4(a, b);
-}
-
-// RV32-LABEL: define dso_local i64 @test_pmin_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
-//
-// RV64-LABEL: define dso_local i64 @test_pmin_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
-int32x2_t test_pmin_i32x2(int32x2_t a, int32x2_t b) {
-  return __riscv_pmin_i32x2(a, b);
+int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
+  return __riscv_psll_s_i16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pminu_u8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_pminu_u8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MIN_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
-uint8x8_t test_pminu_u8x8(uint8x8_t a, uint8x8_t b) {
-  return __riscv_pminu_u8x8(a, b);
+uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
+  return __riscv_psll_s_u16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pminu_u16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i64 @test_pminu_u16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MIN_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
-uint16x4_t test_pminu_u16x4(uint16x4_t a, uint16x4_t b) {
-  return __riscv_pminu_u16x4(a, b);
+int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
+  return __riscv_psll_s_i32x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pminu_u32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i64 @test_pminu_u32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ELT_MIN_I:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MIN_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
-uint32x2_t test_pminu_u32x2(uint32x2_t a, uint32x2_t b) {
-  return __riscv_pminu_u32x2(a, b);
+uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) {
+  return __riscv_psll_s_u32x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pmax_i8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_pmax_i8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
-int8x8_t test_pmax_i8x8(int8x8_t a, int8x8_t b) {
-  return __riscv_pmax_i8x8(a, b);
+int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
+  return __riscv_psra_s_i8x8(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pmax_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.smax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_pmax_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.smax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
-int16x4_t test_pmax_i16x4(int16x4_t a, int16x4_t b) {
-  return __riscv_pmax_i16x4(a, b);
+uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
+  return __riscv_psrl_s_u8x8(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pmax_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_pmax_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
-int32x2_t test_pmax_i32x2(int32x2_t a, int32x2_t b) {
-  return __riscv_pmax_i32x2(a, b);
+int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
+  return __riscv_psra_s_i16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pmaxu_u8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
-// RV64-LABEL: define dso_local i64 @test_pmaxu_u8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ELT_MAX_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
-uint8x8_t test_pmaxu_u8x8(uint8x8_t a, uint8x8_t b) {
-  return __riscv_pmaxu_u8x8(a, b);
+uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
+  return __riscv_psrl_s_u16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pmaxu_u16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.umax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i64 @test_pmaxu_u16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <4 x i16> @llvm.umax.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ELT_MAX_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
-uint16x4_t test_pmaxu_u16x4(uint16x4_t a, uint16x4_t b) {
-  return __riscv_pmaxu_u16x4(a, b);
+int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
+  return __riscv_psra_s_i32x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_pmaxu_u32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.umax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local i64 @test_pmaxu_u32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ELT_MAX_I:%.*]] = call <2 x i32> @llvm.umax.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ELT_MAX_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
-uint32x2_t test_pmaxu_u32x2(uint32x2_t a, uint32x2_t b) {
-  return __riscv_pmaxu_u32x2(a, b);
+uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned shamt) {
+  return __riscv_psrl_s_u32x2(a, shamt);
 }
 
 /* Packed Logical Operations (32-bit) */
diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
index 6a01dcfa35219..5e9afc3cd5f99 100644
--- a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
+++ b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
@@ -152,328 +152,6 @@ int32x2_t test_pmv_s_i32x2_imm_big(void) {
   return __riscv_pmv_s_i32x2(0x12345);
 }
 
-// CHECK-LABEL: test_psll_s_u8x4:
-// CHECK:       psll.bs
-uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned n) {
-  return __riscv_psll_s_u8x4(a, n);
-}
-
-// CHECK-LABEL: test_psll_s_i8x4:
-// CHECK:       psll.bs
-int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned n) {
-  return __riscv_psll_s_i8x4(a, n);
-}
-
-// CHECK-LABEL: test_psll_s_u16x2:
-// CHECK:       psll.hs
-uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned n) {
-  return __riscv_psll_s_u16x2(a, n);
-}
-
-// CHECK-LABEL: test_psll_s_i16x2:
-// CHECK:       psll.hs
-int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned n) {
-  return __riscv_psll_s_i16x2(a, n);
-}
-
-// CHECK-LABEL: test_psrl_s_u8x4:
-// CHECK:       psrl.bs
-uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned n) {
-  return __riscv_psrl_s_u8x4(a, n);
-}
-
-// CHECK-LABEL: test_psrl_s_u16x2:
-// CHECK:       psrl.hs
-uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned n) {
-  return __riscv_psrl_s_u16x2(a, n);
-}
-
-// CHECK-LABEL: test_psra_s_i8x4:
-// CHECK:       psra.bs
-int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned n) {
-  return __riscv_psra_s_i8x4(a, n);
-}
-
-// CHECK-LABEL: test_psra_s_i16x2:
-// CHECK:       psra.hs
-int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned n) {
-  return __riscv_psra_s_i16x2(a, n);
-}
-
-// CHECK-LABEL: test_psll_s_u8x4_imm:
-// CHECK:       pslli.b{{[[:space:]]+}}{{.*}}, 2
-uint8x4_t test_psll_s_u8x4_imm(uint8x4_t a) {
-  return __riscv_psll_s_u8x4(a, 2);
-}
-
-// CHECK-LABEL: test_psll_s_i8x4_imm:
-// CHECK:       pslli.b{{[[:space:]]+}}{{.*}}, 3
-int8x4_t test_psll_s_i8x4_imm(int8x4_t a) { return __riscv_psll_s_i8x4(a, 3); }
-
-// CHECK-LABEL: test_psll_s_u16x2_imm:
-// CHECK:       pslli.h{{[[:space:]]+}}{{.*}}, 5
-uint16x2_t test_psll_s_u16x2_imm(uint16x2_t a) {
-  return __riscv_psll_s_u16x2(a, 5);
-}
-
-// CHECK-LABEL: test_psll_s_i16x2_imm:
-// CHECK:       pslli.h{{[[:space:]]+}}{{.*}}, 7
-int16x2_t test_psll_s_i16x2_imm(int16x2_t a) {
-  return __riscv_psll_s_i16x2(a, 7);
-}
-
-// CHECK-LABEL: test_psrl_s_u8x4_imm:
-// CHECK:       psrli.b{{[[:space:]]+}}{{.*}}, 2
-uint8x4_t test_psrl_s_u8x4_imm(uint8x4_t a) {
-  return __riscv_psrl_s_u8x4(a, 2);
-}
-
-// CHECK-LABEL: test_psrl_s_u16x2_imm:
-// CHECK:       psrli.h{{[[:space:]]+}}{{.*}}, 3
-uint16x2_t test_psrl_s_u16x2_imm(uint16x2_t a) {
-  return __riscv_psrl_s_u16x2(a, 3);
-}
-
-// CHECK-LABEL: test_psra_s_i8x4_imm:
-// CHECK:       psrai.b{{[[:space:]]+}}{{.*}}, 4
-int8x4_t test_psra_s_i8x4_imm(int8x4_t a) { return __riscv_psra_s_i8x4(a, 4); }
-
-// CHECK-LABEL: test_psra_s_i16x2_imm:
-// CHECK:       psrai.h{{[[:space:]]+}}{{.*}}, 5
-int16x2_t test_psra_s_i16x2_imm(int16x2_t a) {
-  return __riscv_psra_s_i16x2(a, 5);
-}
-
-// CHECK-LABEL: test_psll_s_u8x8:
-// RV32:        psll.dbs
-// RV64:        psll.bs
-uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned n) {
-  return __riscv_psll_s_u8x8(a, n);
-}
-
-// CHECK-LABEL: test_psll_s_i8x8:
-// RV32:        psll.dbs
-// RV64:        psll.bs
-int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned n) {
-  return __riscv_psll_s_i8x8(a, n);
-}
-
-// CHECK-LABEL: test_psll_s_u16x4:
-// RV32:        psll.dhs
-// RV64:        psll.hs
-uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned n) {
-  return __riscv_psll_s_u16x4(a, n);
-}
-
-// CHECK-LABEL: test_psll_s_i16x4:
-// RV32:        psll.dhs
-// RV64:        psll.hs
-int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned n) {
-  return __riscv_psll_s_i16x4(a, n);
-}
-
-// CHECK-LABEL: test_psll_s_u32x2:
-// RV32:        psll.dws
-// RV64:        psll.ws
-uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned n) {
-  return __riscv_psll_s_u32x2(a, n);
-}
-
-// CHECK-LABEL: test_psll_s_i32x2:
-// RV32:        psll.dws
-// RV64:        psll.ws
-int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned n) {
-  return __riscv_psll_s_i32x2(a, n);
-}
-
-// CHECK-LABEL: test_psrl_s_u8x8:
-// RV32:        psrl.dbs
-// RV64:        psrl.bs
-uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned n) {
-  return __riscv_psrl_s_u8x8(a, n);
-}
-
-// CHECK-LABEL: test_psrl_s_u16x4:
-// RV32:        psrl.dhs
-// RV64:        psrl.hs
-uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned n) {
-  return __riscv_psrl_s_u16x4(a, n);
-}
-
-// CHECK-LABEL: test_psrl_s_u32x2:
-// RV32:        psrl.dws
-// RV64:        psrl.ws
-uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned n) {
-  return __riscv_psrl_s_u32x2(a, n);
-}
-
-// CHECK-LABEL: test_psra_s_i8x8:
-// RV32:        psra.dbs
-// RV64:        psra.bs
-int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned n) {
-  return __riscv_psra_s_i8x8(a, n);
-}
-
-// CHECK-LABEL: test_psra_s_i16x4:
-// RV32:        psra.dhs
-// RV64:        psra.hs
-int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned n) {
-  return __riscv_psra_s_i16x4(a, n);
-}
-
-// CHECK-LABEL: test_psra_s_i32x2:
-// RV32:        psra.dws
-// RV64:        psra.ws
-int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned n) {
-  return __riscv_psra_s_i32x2(a, n);
-}
-
-// CHECK-LABEL: test_psll_s_u8x8_imm:
-// RV32:        pslli.db{{[[:space:]]+}}{{.*}}, 2
-// RV64:        pslli.b{{[[:space:]]+}}{{.*}}, 2
-uint8x8_t test_psll_s_u8x8_imm(uint8x8_t a) {
-  return __riscv_psll_s_u8x8(a, 2);
-}
-
-// CHECK-LABEL: test_psll_s_i8x8_imm:
-// RV32:        pslli.db{{[[:space:]]+}}{{.*}}, 3
-// RV64:        pslli.b{{[[:space:]]+}}{{.*}}, 3
-int8x8_t test_psll_s_i8x8_imm(int8x8_t a) { return __riscv_psll_s_i8x8(a, 3); }
-
-// CHECK-LABEL: test_psll_s_u16x4_imm:
-// RV32:        pslli.dh{{[[:space:]]+}}{{.*}}, 4
-// RV64:        pslli.h{{[[:space:]]+}}{{.*}}, 4
-uint16x4_t test_psll_s_u16x4_imm(uint16x4_t a) {
-  return __riscv_psll_s_u16x4(a, 4);
-}
-
-// CHECK-LABEL: test_psll_s_i16x4_imm:
-// RV32:        pslli.dh{{[[:space:]]+}}{{.*}}, 5
-// RV64:        pslli.h{{[[:space:]]+}}{{.*}}, 5
-int16x4_t test_psll_s_i16x4_imm(int16x4_t a) {
-  return __riscv_psll_s_i16x4(a, 5);
-}
-
-// CHECK-LABEL: test_psll_s_u32x2_imm:
-// RV32:        pslli.dw{{[[:space:]]+}}{{.*}}, 7
-// RV64:        pslli.w{{[[:space:]]+}}{{.*}}, 7
-uint32x2_t test_psll_s_u32x2_imm(uint32x2_t a) {
-  return __riscv_psll_s_u32x2(a, 7);
-}
-
-// CHECK-LABEL: test_psll_s_i32x2_imm:
-// RV32:        pslli.dw{{[[:space:]]+}}{{.*}}, 9
-// RV64:        pslli.w{{[[:space:]]+}}{{.*}}, 9
-int32x2_t test_psll_s_i32x2_imm(int32x2_t a) {
-  return __riscv_psll_s_i32x2(a, 9);
-}
-
-// CHECK-LABEL: test_psrl_s_u8x8_imm:
-// RV32:        psrli.db{{[[:space:]]+}}{{.*}}, 2
-// RV64:        psrli.b{{[[:space:]]+}}{{.*}}, 2
-uint8x8_t test_psrl_s_u8x8_imm(uint8x8_t a) {
-  return __riscv_psrl_s_u8x8(a, 2);
-}
-
-// CHECK-LABEL: test_psrl_s_u16x4_imm:
-// RV32:        psrli.dh{{[[:space:]]+}}{{.*}}, 3
-// RV64:        psrli.h{{[[:space:]]+}}{{.*}}, 3
-uint16x4_t test_psrl_s_u16x4_imm(uint16x4_t a) {
-  return __riscv_psrl_s_u16x4(a, 3);
-}
-
-// CHECK-LABEL: test_psrl_s_u32x2_imm:
-// RV32:        psrli.dw{{[[:space:]]+}}{{.*}}, 5
-// RV64:        psrli.w{{[[:space:]]+}}{{.*}}, 5
-uint32x2_t test_psrl_s_u32x2_imm(uint32x2_t a) {
-  return __riscv_psrl_s_u32x2(a, 5);
-}
-
-// CHECK-LABEL: test_psra_s_i8x8_imm:
-// RV32:        psrai.db{{[[:space:]]+}}{{.*}}, 4
-// RV64:        psrai.b{{[[:space:]]+}}{{.*}}, 4
-int8x8_t test_psra_s_i8x8_imm(int8x8_t a) { return __riscv_psra_s_i8x8(a, 4); }
-
-// CHECK-LABEL: test_psra_s_i16x4_imm:
-// RV32:        psrai.dh{{[[:space:]]+}}{{.*}}, 5
-// RV64:        psrai.h{{[[:space:]]+}}{{.*}}, 5
-int16x4_t test_psra_s_i16x4_imm(int16x4_t a) {
-  return __riscv_psra_s_i16x4(a, 5);
-}
-
-// CHECK-LABEL: test_psra_s_i32x2_imm:
-// RV32:        psrai.dw{{[[:space:]]+}}{{.*}}, 11
-// RV64:        psrai.w{{[[:space:]]+}}{{.*}}, 11
-int32x2_t test_psra_s_i32x2_imm(int32x2_t a) {
-  return __riscv_psra_s_i32x2(a, 11);
-}
-
-// CHECK-LABEL: test_padd_s_u8x4:
-// CHECK:       padd.bs
-uint8x4_t test_padd_s_u8x4(uint8x4_t a, uint8_t b) {
-  return __riscv_padd_s_u8x4(a, b);
-}
-
-// CHECK-LABEL: test_padd_s_i8x4:
-// CHECK:       padd.bs
-int8x4_t test_padd_s_i8x4(int8x4_t a, int8_t b) {
-  return __riscv_padd_s_i8x4(a, b);
-}
-
-// CHECK-LABEL: test_padd_s_u16x2:
-// CHECK:       padd.hs
-uint16x2_t test_padd_s_u16x2(uint16x2_t a, uint16_t b) {
-  return __riscv_padd_s_u16x2(a, b);
-}
-
-// CHECK-LABEL: test_padd_s_i16x2:
-// CHECK:       padd.hs
-int16x2_t test_padd_s_i16x2(int16x2_t a, int16_t b) {
-  return __riscv_padd_s_i16x2(a, b);
-}
-
-// CHECK-LABEL: test_padd_s_u8x8:
-// RV32:        padd.dbs
-// RV64:        padd.bs
-uint8x8_t test_padd_s_u8x8(uint8x8_t a, uint8_t b) {
-  return __riscv_padd_s_u8x8(a, b);
-}
-
-// CHECK-LABEL: test_padd_s_i8x8:
-// RV32:        padd.dbs
-// RV64:        padd.bs
-int8x8_t test_padd_s_i8x8(int8x8_t a, int8_t b) {
-  return __riscv_padd_s_i8x8(a, b);
-}
-
-// CHECK-LABEL: test_padd_s_u16x4:
-// RV32:        padd.dhs
-// RV64:        padd.hs
-uint16x4_t test_padd_s_u16x4(uint16x4_t a, uint16_t b) {
-  return __riscv_padd_s_u16x4(a, b);
-}
-
-// CHECK-LABEL: test_padd_s_i16x4:
-// RV32:        padd.dhs
-// RV64:        padd.hs
-int16x4_t test_padd_s_i16x4(int16x4_t a, int16_t b) {
-  return __riscv_padd_s_i16x4(a, b);
-}
-
-// CHECK-LABEL: test_padd_s_u32x2:
-// RV32:        padd.dws
-// RV64:        padd.ws
-uint32x2_t test_padd_s_u32x2(uint32x2_t a, uint32_t b) {
-  return __riscv_padd_s_u32x2(a, b);
-}
-
-// CHECK-LABEL: test_padd_s_i32x2:
-// RV32:        padd.dws
-// RV64:        padd.ws
-int32x2_t test_padd_s_i32x2(int32x2_t a, int32_t b) {
-  return __riscv_padd_s_i32x2(a, b);
-}
-
 // CHECK-LABEL: test_padd_i8x4:
 // CHECK:       padd.b
 int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
@@ -629,6 +307,72 @@ int16x4_t test_pneg_i16x4(int16x4_t a) { return __riscv_pneg_i16x4(a); }
 // RV64:        pneg.w
 int32x2_t test_pneg_i32x2(int32x2_t a) { return __riscv_pneg_i32x2(a); }
 
+// CHECK-LABEL: test_padd_s_u8x4:
+// CHECK:       padd.bs
+uint8x4_t test_padd_s_u8x4(uint8x4_t a, uint8_t b) {
+  return __riscv_padd_s_u8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i8x4:
+// CHECK:       padd.bs
+int8x4_t test_padd_s_i8x4(int8x4_t a, int8_t b) {
+  return __riscv_padd_s_i8x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u16x2:
+// CHECK:       padd.hs
+uint16x2_t test_padd_s_u16x2(uint16x2_t a, uint16_t b) {
+  return __riscv_padd_s_u16x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i16x2:
+// CHECK:       padd.hs
+int16x2_t test_padd_s_i16x2(int16x2_t a, int16_t b) {
+  return __riscv_padd_s_i16x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u8x8:
+// RV32:        padd.dbs
+// RV64:        padd.bs
+uint8x8_t test_padd_s_u8x8(uint8x8_t a, uint8_t b) {
+  return __riscv_padd_s_u8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i8x8:
+// RV32:        padd.dbs
+// RV64:        padd.bs
+int8x8_t test_padd_s_i8x8(int8x8_t a, int8_t b) {
+  return __riscv_padd_s_i8x8(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u16x4:
+// RV32:        padd.dhs
+// RV64:        padd.hs
+uint16x4_t test_padd_s_u16x4(uint16x4_t a, uint16_t b) {
+  return __riscv_padd_s_u16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i16x4:
+// RV32:        padd.dhs
+// RV64:        padd.hs
+int16x4_t test_padd_s_i16x4(int16x4_t a, int16_t b) {
+  return __riscv_padd_s_i16x4(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_u32x2:
+// RV32:        padd.dws
+// RV64:        padd.ws
+uint32x2_t test_padd_s_u32x2(uint32x2_t a, uint32_t b) {
+  return __riscv_padd_s_u32x2(a, b);
+}
+
+// CHECK-LABEL: test_padd_s_i32x2:
+// RV32:        padd.dws
+// RV64:        padd.ws
+int32x2_t test_padd_s_i32x2(int32x2_t a, int32_t b) {
+  return __riscv_padd_s_i32x2(a, b);
+}
+
 // CHECK-LABEL: test_psadd_i8x4:
 // CHECK:       psadd.b
 int8x4_t test_psadd_i8x4(int8x4_t a, int8x4_t b) {
@@ -953,6 +697,262 @@ uint32x2_t test_pmaxu_u32x2(uint32x2_t a, uint32x2_t b) {
   return __riscv_pmaxu_u32x2(a, b);
 }
 
+// CHECK-LABEL: test_psll_s_u8x4:
+// CHECK:       psll.bs
+uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned n) {
+  return __riscv_psll_s_u8x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i8x4:
+// CHECK:       psll.bs
+int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned n) {
+  return __riscv_psll_s_i8x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u16x2:
+// CHECK:       psll.hs
+uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned n) {
+  return __riscv_psll_s_u16x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i16x2:
+// CHECK:       psll.hs
+int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned n) {
+  return __riscv_psll_s_i16x2(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x4:
+// CHECK:       psrl.bs
+uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned n) {
+  return __riscv_psrl_s_u8x4(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x2:
+// CHECK:       psrl.hs
+uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned n) {
+  return __riscv_psrl_s_u16x2(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i8x4:
+// CHECK:       psra.bs
+int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned n) {
+  return __riscv_psra_s_i8x4(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i16x2:
+// CHECK:       psra.hs
+int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned n) {
+  return __riscv_psra_s_i16x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u8x4_imm:
+// CHECK:       pslli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x4_t test_psll_s_u8x4_imm(uint8x4_t a) {
+  return __riscv_psll_s_u8x4(a, 2);
+}
+
+// CHECK-LABEL: test_psll_s_i8x4_imm:
+// CHECK:       pslli.b{{[[:space:]]+}}{{.*}}, 3
+int8x4_t test_psll_s_i8x4_imm(int8x4_t a) { return __riscv_psll_s_i8x4(a, 3); }
+
+// CHECK-LABEL: test_psll_s_u16x2_imm:
+// CHECK:       pslli.h{{[[:space:]]+}}{{.*}}, 5
+uint16x2_t test_psll_s_u16x2_imm(uint16x2_t a) {
+  return __riscv_psll_s_u16x2(a, 5);
+}
+
+// CHECK-LABEL: test_psll_s_i16x2_imm:
+// CHECK:       pslli.h{{[[:space:]]+}}{{.*}}, 7
+int16x2_t test_psll_s_i16x2_imm(int16x2_t a) {
+  return __riscv_psll_s_i16x2(a, 7);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x4_imm:
+// CHECK:       psrli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x4_t test_psrl_s_u8x4_imm(uint8x4_t a) {
+  return __riscv_psrl_s_u8x4(a, 2);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x2_imm:
+// CHECK:       psrli.h{{[[:space:]]+}}{{.*}}, 3
+uint16x2_t test_psrl_s_u16x2_imm(uint16x2_t a) {
+  return __riscv_psrl_s_u16x2(a, 3);
+}
+
+// CHECK-LABEL: test_psra_s_i8x4_imm:
+// CHECK:       psrai.b{{[[:space:]]+}}{{.*}}, 4
+int8x4_t test_psra_s_i8x4_imm(int8x4_t a) { return __riscv_psra_s_i8x4(a, 4); }
+
+// CHECK-LABEL: test_psra_s_i16x2_imm:
+// CHECK:       psrai.h{{[[:space:]]+}}{{.*}}, 5
+int16x2_t test_psra_s_i16x2_imm(int16x2_t a) {
+  return __riscv_psra_s_i16x2(a, 5);
+}
+
+// CHECK-LABEL: test_psll_s_u8x8:
+// RV32:        psll.dbs
+// RV64:        psll.bs
+uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned n) {
+  return __riscv_psll_s_u8x8(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i8x8:
+// RV32:        psll.dbs
+// RV64:        psll.bs
+int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned n) {
+  return __riscv_psll_s_i8x8(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u16x4:
+// RV32:        psll.dhs
+// RV64:        psll.hs
+uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned n) {
+  return __riscv_psll_s_u16x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i16x4:
+// RV32:        psll.dhs
+// RV64:        psll.hs
+int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned n) {
+  return __riscv_psll_s_i16x4(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u32x2:
+// RV32:        psll.dws
+// RV64:        psll.ws
+uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned n) {
+  return __riscv_psll_s_u32x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_i32x2:
+// RV32:        psll.dws
+// RV64:        psll.ws
+int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned n) {
+  return __riscv_psll_s_i32x2(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x8:
+// RV32:        psrl.dbs
+// RV64:        psrl.bs
+uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned n) {
+  return __riscv_psrl_s_u8x8(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x4:
+// RV32:        psrl.dhs
+// RV64:        psrl.hs
+uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned n) {
+  return __riscv_psrl_s_u16x4(a, n);
+}
+
+// CHECK-LABEL: test_psrl_s_u32x2:
+// RV32:        psrl.dws
+// RV64:        psrl.ws
+uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned n) {
+  return __riscv_psrl_s_u32x2(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i8x8:
+// RV32:        psra.dbs
+// RV64:        psra.bs
+int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned n) {
+  return __riscv_psra_s_i8x8(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i16x4:
+// RV32:        psra.dhs
+// RV64:        psra.hs
+int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned n) {
+  return __riscv_psra_s_i16x4(a, n);
+}
+
+// CHECK-LABEL: test_psra_s_i32x2:
+// RV32:        psra.dws
+// RV64:        psra.ws
+int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned n) {
+  return __riscv_psra_s_i32x2(a, n);
+}
+
+// CHECK-LABEL: test_psll_s_u8x8_imm:
+// RV32:        pslli.db{{[[:space:]]+}}{{.*}}, 2
+// RV64:        pslli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x8_t test_psll_s_u8x8_imm(uint8x8_t a) {
+  return __riscv_psll_s_u8x8(a, 2);
+}
+
+// CHECK-LABEL: test_psll_s_i8x8_imm:
+// RV32:        pslli.db{{[[:space:]]+}}{{.*}}, 3
+// RV64:        pslli.b{{[[:space:]]+}}{{.*}}, 3
+int8x8_t test_psll_s_i8x8_imm(int8x8_t a) { return __riscv_psll_s_i8x8(a, 3); }
+
+// CHECK-LABEL: test_psll_s_u16x4_imm:
+// RV32:        pslli.dh{{[[:space:]]+}}{{.*}}, 4
+// RV64:        pslli.h{{[[:space:]]+}}{{.*}}, 4
+uint16x4_t test_psll_s_u16x4_imm(uint16x4_t a) {
+  return __riscv_psll_s_u16x4(a, 4);
+}
+
+// CHECK-LABEL: test_psll_s_i16x4_imm:
+// RV32:        pslli.dh{{[[:space:]]+}}{{.*}}, 5
+// RV64:        pslli.h{{[[:space:]]+}}{{.*}}, 5
+int16x4_t test_psll_s_i16x4_imm(int16x4_t a) {
+  return __riscv_psll_s_i16x4(a, 5);
+}
+
+// CHECK-LABEL: test_psll_s_u32x2_imm:
+// RV32:        pslli.dw{{[[:space:]]+}}{{.*}}, 7
+// RV64:        pslli.w{{[[:space:]]+}}{{.*}}, 7
+uint32x2_t test_psll_s_u32x2_imm(uint32x2_t a) {
+  return __riscv_psll_s_u32x2(a, 7);
+}
+
+// CHECK-LABEL: test_psll_s_i32x2_imm:
+// RV32:        pslli.dw{{[[:space:]]+}}{{.*}}, 9
+// RV64:        pslli.w{{[[:space:]]+}}{{.*}}, 9
+int32x2_t test_psll_s_i32x2_imm(int32x2_t a) {
+  return __riscv_psll_s_i32x2(a, 9);
+}
+
+// CHECK-LABEL: test_psrl_s_u8x8_imm:
+// RV32:        psrli.db{{[[:space:]]+}}{{.*}}, 2
+// RV64:        psrli.b{{[[:space:]]+}}{{.*}}, 2
+uint8x8_t test_psrl_s_u8x8_imm(uint8x8_t a) {
+  return __riscv_psrl_s_u8x8(a, 2);
+}
+
+// CHECK-LABEL: test_psrl_s_u16x4_imm:
+// RV32:        psrli.dh{{[[:space:]]+}}{{.*}}, 3
+// RV64:        psrli.h{{[[:space:]]+}}{{.*}}, 3
+uint16x4_t test_psrl_s_u16x4_imm(uint16x4_t a) {
+  return __riscv_psrl_s_u16x4(a, 3);
+}
+
+// CHECK-LABEL: test_psrl_s_u32x2_imm:
+// RV32:        psrli.dw{{[[:space:]]+}}{{.*}}, 5
+// RV64:        psrli.w{{[[:space:]]+}}{{.*}}, 5
+uint32x2_t test_psrl_s_u32x2_imm(uint32x2_t a) {
+  return __riscv_psrl_s_u32x2(a, 5);
+}
+
+// CHECK-LABEL: test_psra_s_i8x8_imm:
+// RV32:        psrai.db{{[[:space:]]+}}{{.*}}, 4
+// RV64:        psrai.b{{[[:space:]]+}}{{.*}}, 4
+int8x8_t test_psra_s_i8x8_imm(int8x8_t a) { return __riscv_psra_s_i8x8(a, 4); }
+
+// CHECK-LABEL: test_psra_s_i16x4_imm:
+// RV32:        psrai.dh{{[[:space:]]+}}{{.*}}, 5
+// RV64:        psrai.h{{[[:space:]]+}}{{.*}}, 5
+int16x4_t test_psra_s_i16x4_imm(int16x4_t a) {
+  return __riscv_psra_s_i16x4(a, 5);
+}
+
+// CHECK-LABEL: test_psra_s_i32x2_imm:
+// RV32:        psrai.dw{{[[:space:]]+}}{{.*}}, 11
+// RV64:        psrai.w{{[[:space:]]+}}{{.*}}, 11
+int32x2_t test_psra_s_i32x2_imm(int32x2_t a) {
+  return __riscv_psra_s_i32x2(a, 11);
+}
+
 // CHECK-LABEL: test_pand_i8x4:
 // CHECK:       and{{[[:space:]]}}
 int8x4_t test_pand_i8x4(int8x4_t a, int8x4_t b) {

>From a3cb4957e99a9659b8e9643c1d0a55a95bc9d4a6 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Tue, 9 Jun 2026 08:31:25 +0800
Subject: [PATCH 18/19] [Clang][RISCV] disable clang-format on packed macro
 call block

---
 clang/lib/Headers/riscv_packed_simd.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang/lib/Headers/riscv_packed_simd.h b/clang/lib/Headers/riscv_packed_simd.h
index 1f4f33c5fafa1..a4667445d1b67 100644
--- a/clang/lib/Headers/riscv_packed_simd.h
+++ b/clang/lib/Headers/riscv_packed_simd.h
@@ -90,6 +90,9 @@ typedef uint32_t uint32x2_t __attribute__((__vector_size__(8)));
         __builtin_elementwise_add_sat(__rs1, __rs1), __rs2);                   \
   }
 
+// clang-format off: macro call sites have no trailing semicolons, which
+// confuses clang-format into a deeply nested expression.
+
 /* Packed Splat (32-bit) */
 __packed_splat(pmv_s_u8x4, uint8x4_t, uint8_t, __packed_splat4)
 __packed_splat(pmv_s_i8x4, int8x4_t, int8_t, __packed_splat4)
@@ -282,6 +285,8 @@ __packed_unary_op(pnot_u16x4, uint16x4_t, ~)
 __packed_unary_op(pnot_i32x2, int32x2_t, ~)
 __packed_unary_op(pnot_u32x2, uint32x2_t, ~)
 
+// clang-format on
+
 #undef __packed_splat2
 #undef __packed_splat4
 #undef __packed_splat8

>From 737c111a7276805190e38fea86a11e8e4466c2dd Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Tue, 9 Jun 2026 08:54:04 +0800
Subject: [PATCH 19/19] clang-format

---
 clang/lib/Headers/riscv_packed_simd.h         | 22 ++++++++-----------
 .../riscv_packed_simd.c                       |  6 +++--
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/clang/lib/Headers/riscv_packed_simd.h b/clang/lib/Headers/riscv_packed_simd.h
index a4667445d1b67..828cb90f8034a 100644
--- a/clang/lib/Headers/riscv_packed_simd.h
+++ b/clang/lib/Headers/riscv_packed_simd.h
@@ -42,23 +42,22 @@ typedef uint32_t uint32x2_t __attribute__((__vector_size__(8)));
   }
 
 #define __packed_shift(name, ty, op, mask)                                     \
-  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
-  __riscv_##name(ty __rs1, unsigned __rs2) {                                   \
-    return __rs1 op (__rs2 & (mask));                                          \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1,             \
+                                                         unsigned __rs2) {     \
+    return __rs1 op(__rs2 & (mask));                                           \
   }
 #define __packed_shift8(name, ty, op) __packed_shift(name, ty, op, 0x7)
 #define __packed_shift16(name, ty, op) __packed_shift(name, ty, op, 0xf)
 #define __packed_shift32(name, ty, op) __packed_shift(name, ty, op, 0x1f)
 
 #define __packed_scalar_binary_op(name, ty, scalar_ty, op, splat)              \
-  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
-  __riscv_##name(ty __rs1, scalar_ty __rs2) {                                  \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1,             \
+                                                         scalar_ty __rs2) {    \
     return __rs1 op splat(ty, __rs2);                                          \
   }
 
 #define __packed_binary_op(name, ty, op)                                       \
-  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
-  __riscv_##name(ty __rs1, ty __rs2) {                                         \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
     return __rs1 op __rs2;                                                     \
   }
 
@@ -68,14 +67,12 @@ typedef uint32_t uint32x2_t __attribute__((__vector_size__(8)));
   }
 
 #define __packed_binary_builtin(name, ty, builtin)                             \
-  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
-  __riscv_##name(ty __rs1, ty __rs2) {                                         \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
     return builtin(__rs1, __rs2);                                              \
   }
 
 #define __packed_sh1add(name, ty)                                              \
-  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
-  __riscv_##name(ty __rs1, ty __rs2) {                                         \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
     return (__rs1 << 1) + __rs2;                                               \
   }
 
@@ -84,8 +81,7 @@ typedef uint32_t uint32x2_t __attribute__((__vector_size__(8)));
  * for signed types and the backend's saturating_shl1 PatFrags matches both
  * shapes. */
 #define __packed_sh1sadd(name, ty)                                             \
-  static __inline__ ty __DEFAULT_FN_ATTRS                                      \
-  __riscv_##name(ty __rs1, ty __rs2) {                                         \
+  static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, ty __rs2) { \
     return __builtin_elementwise_add_sat(                                      \
         __builtin_elementwise_add_sat(__rs1, __rs1), __rs2);                   \
   }
diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
index 5e9afc3cd5f99..a2c4b83360207 100644
--- a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
+++ b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
@@ -2,10 +2,12 @@
 // expected-no-diagnostics
 
 // RUN: %clang %s -O2 -S -o - --target=riscv32 \
-// RUN:   -menable-experimental-extensions -march=rv32i_p0p21 -Werror -Wextra -Xclang -verify \
+// RUN:   -menable-experimental-extensions -march=rv32i_p0p21 \
+// RUN:   -Werror -Wextra -Xclang -verify \
 // RUN:   | FileCheck %s --check-prefixes=CHECK,RV32
 // RUN: %clang %s -O2 -S -o - --target=riscv64 \
-// RUN:   -menable-experimental-extensions -march=rv64i_p0p21 -Werror -Wextra -Xclang -verify \
+// RUN:   -menable-experimental-extensions -march=rv64i_p0p21 \
+// RUN:   -Werror -Wextra -Xclang -verify \
 // RUN:   | FileCheck %s --check-prefixes=CHECK,RV64
 
 #include <riscv_packed_simd.h>



More information about the cfe-commits mailing list