[clang] [RISCV] Add riscv_packed.h for P extension intrinsics (PR #181115)

via cfe-commits cfe-commits at lists.llvm.org
Thu Mar 12 04:47:03 PDT 2026


https://github.com/sihuan updated https://github.com/llvm/llvm-project/pull/181115

>From d35130e58e3233c5980ca12ff04ab92bb069b4d3 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Fri, 30 Jan 2026 00:41:13 +0800
Subject: [PATCH 1/8] [Clang][RISCV] Add riscv_simd.h for P extension
 intrinsics

This patch adds `riscv_simd.h`, introducing initial support for RISC-V P extension intrinsics.

The supported operations include:
- Packed addition and subtraction (padd, psub)
- Packed logical and arithmetic shifts (psll, psrl, psra)

These intrinsics are implemented using standard C operators to generate canonical LLVM IR (e.g., `add <4 x i8>`, `shl <2 x i16>`). The implementation relies on the RISC-V backend to correctly lower this IR to specific P extension instructions.
---
 clang/lib/Headers/CMakeLists.txt          |    1 +
 clang/lib/Headers/riscv_simd.h            |  245 +++++
 clang/test/CodeGen/RISCV/rvp-intrinsics.c | 1009 +++++++++++++++++++++
 3 files changed, 1255 insertions(+)
 create mode 100644 clang/lib/Headers/riscv_simd.h
 create mode 100644 clang/test/CodeGen/RISCV/rvp-intrinsics.c

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index c92b370b88d2d..76574d7a937e8 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -137,6 +137,7 @@ set(riscv_files
   sifive_vector.h
   andes_vector.h
   riscv_mips.h
+  riscv_simd.h
   )
 
 set(spirv_files
diff --git a/clang/lib/Headers/riscv_simd.h b/clang/lib/Headers/riscv_simd.h
new file mode 100644
index 0000000000000..262f35b483cbd
--- /dev/null
+++ b/clang/lib/Headers/riscv_simd.h
@@ -0,0 +1,245 @@
+/*===---- riscv_simd.h - RISC-V P intrinsics -----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_SIMD_H
+#define __RISCV_SIMD_H
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Packed SIMD Types */
+
+typedef int8_t int8x4_t __attribute__((vector_size(4)));
+typedef uint8_t uint8x4_t __attribute__((vector_size(4)));
+typedef int16_t int16x2_t __attribute__((vector_size(4)));
+typedef uint16_t uint16x2_t __attribute__((vector_size(4)));
+
+typedef int8_t int8x8_t __attribute__((vector_size(8)));
+typedef uint8_t uint8x8_t __attribute__((vector_size(8)));
+typedef int16_t int16x4_t __attribute__((vector_size(8)));
+typedef uint16_t uint16x4_t __attribute__((vector_size(8)));
+typedef int32_t int32x2_t __attribute__((vector_size(8)));
+typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
+
+/* Packed Addition and Subtraction (32-bit) */
+
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_i8x4(int8x4_t __rs1, int8x4_t __rs2) {
+  return __rs1 + __rs2;
+}
+
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_u8x4(uint8x4_t __rs1, uint8x4_t __rs2) {
+  return __rs1 + __rs2;
+}
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_i16x2(int16x2_t __rs1, int16x2_t __rs2) {
+  return __rs1 + __rs2;
+}
+
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_u16x2(uint16x2_t __rs1, uint16x2_t __rs2) {
+  return __rs1 + __rs2;
+}
+
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_i8x4(int8x4_t __rs1, int8x4_t __rs2) {
+  return __rs1 - __rs2;
+}
+
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_u8x4(uint8x4_t __rs1, uint8x4_t __rs2) {
+  return __rs1 - __rs2;
+}
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_i16x2(int16x2_t __rs1, int16x2_t __rs2) {
+  return __rs1 - __rs2;
+}
+
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_u16x2(uint16x2_t __rs1, uint16x2_t __rs2) {
+  return __rs1 - __rs2;
+}
+
+/* Packed Addition and Subtraction (64-bit) */
+
+static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_i8x8(int8x8_t __rs1, int8x8_t __rs2) {
+  return __rs1 + __rs2;
+}
+
+static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_u8x8(uint8x8_t __rs1, uint8x8_t __rs2) {
+  return __rs1 + __rs2;
+}
+
+static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_i16x4(int16x4_t __rs1, int16x4_t __rs2) {
+  return __rs1 + __rs2;
+}
+
+static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_u16x4(uint16x4_t __rs1, uint16x4_t __rs2) {
+  return __rs1 + __rs2;
+}
+
+static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_i32x2(int32x2_t __rs1, int32x2_t __rs2) {
+  return __rs1 + __rs2;
+}
+
+static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_padd_u32x2(uint32x2_t __rs1, uint32x2_t __rs2) {
+  return __rs1 + __rs2;
+}
+
+static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_i8x8(int8x8_t __rs1, int8x8_t __rs2) {
+  return __rs1 - __rs2;
+}
+
+static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_u8x8(uint8x8_t __rs1, uint8x8_t __rs2) {
+  return __rs1 - __rs2;
+}
+
+static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_i16x4(int16x4_t __rs1, int16x4_t __rs2) {
+  return __rs1 - __rs2;
+}
+
+static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_u16x4(uint16x4_t __rs1, uint16x4_t __rs2) {
+  return __rs1 - __rs2;
+}
+
+static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_i32x2(int32x2_t __rs1, int32x2_t __rs2) {
+  return __rs1 - __rs2;
+}
+
+static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psub_u32x2(uint32x2_t __rs1, uint32x2_t __rs2) {
+  return __rs1 - __rs2;
+}
+
+/* Packed Shifts (32-bit) */
+
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_u8x4(uint8x4_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt;
+}
+
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_i8x4(int8x4_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt;
+}
+
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_u16x2(uint16x2_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt;
+}
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_i16x2(int16x2_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt;
+}
+
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psrl_s_u8x4(uint8x4_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt;
+}
+
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psrl_s_u16x2(uint16x2_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt;
+}
+
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psra_s_i8x4(int8x4_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt;
+}
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psra_s_i16x2(int16x2_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt;
+}
+
+/* Packed Shifts (64-bit) */
+
+static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_u8x8(uint8x8_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt;
+}
+
+static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_i8x8(int8x8_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt;
+}
+
+static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_u16x4(uint16x4_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt;
+}
+
+static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_i16x4(int16x4_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt;
+}
+
+static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_u32x2(uint32x2_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt;
+}
+
+static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psll_s_i32x2(int32x2_t __rs1, unsigned __shamt) {
+  return __rs1 << __shamt;
+}
+
+static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psrl_s_u8x8(uint8x8_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt;
+}
+
+static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psrl_s_u16x4(uint16x4_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt;
+}
+
+static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psrl_s_u32x2(uint32x2_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt;
+}
+
+static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psra_s_i8x8(int8x8_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt;
+}
+
+static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psra_s_i16x4(int16x4_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt;
+}
+
+static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
+__riscv_psra_s_i32x2(int32x2_t __rs1, unsigned __shamt) {
+  return __rs1 >> __shamt;
+}
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __RISCV_SIMD_H */
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
new file mode 100644
index 0000000000000..40a21fa071387
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -0,0 +1,1009 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple riscv32 -target-feature +experimental-p \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=sroa,instcombine | FileCheck %s --check-prefix=RV32
+// RUN: %clang_cc1 -triple riscv64 -target-feature +experimental-p \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=sroa,instcombine | FileCheck %s --check-prefix=RV64
+
+#include <riscv_simd.h>
+
+/* 32-bit Packed Addition and Subtraction */
+
+// RV32-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_padd_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_padd_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_padd_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_padd_u16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
+  return __riscv_psub_i8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
+  return __riscv_psub_u8x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
+  return __riscv_psub_i16x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
+  return __riscv_psub_u16x2(a, b);
+}
+
+/* 64-bit Packed Addition and Subtraction */
+
+// RV32-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_padd_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_padd_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_padd_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_padd_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_padd_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_padd_u32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
+  return __riscv_psub_i8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
+  return __riscv_psub_u8x8(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
+  return __riscv_psub_i16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
+  return __riscv_psub_u16x4(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
+  return __riscv_psub_i32x2(a, b);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
+//
+uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
+  return __riscv_psub_u32x2(a, b);
+}
+
+/* 32-bit Packed Shifts */
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
+  return __riscv_psll_s_i8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
+  return __riscv_psll_s_u8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
+  return __riscv_psll_s_i16x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
+  return __riscv_psll_s_u16x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
+  return __riscv_psra_s_i8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
+  return __riscv_psrl_s_u8x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
+  return __riscv_psra_s_i16x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
+  return __riscv_psrl_s_u16x2(a, shamt);
+}
+
+/* 64-bit Packed Shifts */
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
+  return __riscv_psll_s_i8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
+  return __riscv_psll_s_u8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
+  return __riscv_psll_s_i16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
+  return __riscv_psll_s_u16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
+  return __riscv_psll_s_i32x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) {
+  return __riscv_psll_s_u32x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
+  return __riscv_psra_s_i8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
+  return __riscv_psrl_s_u8x8(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
+  return __riscv_psra_s_i16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
+//
+uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
+  return __riscv_psrl_s_u16x4(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
+  return __riscv_psra_s_i32x2(a, shamt);
+}
+
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned shamt) {
+  return __riscv_psrl_s_u32x2(a, shamt);
+}

>From e1abe5f3663a5277a5a9382a6e8f5b318cf40e6c Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Thu, 12 Feb 2026 17:36:06 +0800
Subject: [PATCH 2/8] [Clang][RISCV] Use direct vector types in IR for P
 extension

---
 clang/lib/CodeGen/Targets/RISCV.cpp       |    7 +
 clang/test/CodeGen/RISCV/rvp-intrinsics.c | 1084 ++++++++-------------
 2 files changed, 429 insertions(+), 662 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp
index d1345891e9fb6..01496c4bdbb54 100644
--- a/clang/lib/CodeGen/Targets/RISCV.cpp
+++ b/clang/lib/CodeGen/Targets/RISCV.cpp
@@ -714,6 +714,13 @@ ABIArgInfo RISCVABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
       // Generic vector without riscv_vls_cc should fall through and pass by
       // reference.
       return coerceVLSVector(Ty, ABIVLen);
+    if (getContext().getTargetInfo().hasFeature("experimental-p") &&
+        VT->getVectorKind() == VectorKind::Generic &&
+        VT->getElementType()->isIntegerType() && (Size == 32 || Size == 64)) {
+      uint64_t EltSize = getContext().getTypeSize(VT->getElementType());
+      if (EltSize == 8 || EltSize == 16 || EltSize == 32)
+        return ABIArgInfo::getDirect();
+    }
   }
 
   // Aggregates which are <= 2*XLen will be passed in registers if possible,
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index 40a21fa071387..87ad2aa034b8c 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -10,201 +10,129 @@
 
 /* 32-bit Packed Addition and Subtraction */
 
-// RV32-LABEL: define dso_local i32 @test_padd_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
-//
-// RV64-LABEL: define dso_local i64 @test_padd_i8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV32-LABEL: define dso_local <4 x i8> @test_padd_i8x4(
+// RV32-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[A]], [[B]]
+// RV32-NEXT:    ret <4 x i8> [[ADD_I]]
+//
+// RV64-LABEL: define dso_local <4 x i8> @test_padd_i8x4(
+// RV64-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[A]], [[B]]
+// RV64-NEXT:    ret <4 x i8> [[ADD_I]]
 //
 int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
   return __riscv_padd_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_padd_u8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i8> @test_padd_u8x4(
+// RV32-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[A]], [[B]]
+// RV32-NEXT:    ret <4 x i8> [[ADD_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i8> @test_padd_u8x4(
+// RV64-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[A]], [[B]]
+// RV64-NEXT:    ret <4 x i8> [[ADD_I]]
 //
 uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
   return __riscv_padd_u8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_padd_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i16> @test_padd_i16x2(
+// RV32-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[A]], [[B]]
+// RV32-NEXT:    ret <2 x i16> [[ADD_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i16> @test_padd_i16x2(
+// RV64-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[A]], [[B]]
+// RV64-NEXT:    ret <2 x i16> [[ADD_I]]
 //
 int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
   return __riscv_padd_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_padd_u16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i16> @test_padd_u16x2(
+// RV32-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[A]], [[B]]
+// RV32-NEXT:    ret <2 x i16> [[ADD_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i16> @test_padd_u16x2(
+// RV64-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[A]], [[B]]
+// RV64-NEXT:    ret <2 x i16> [[ADD_I]]
 //
 uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
   return __riscv_padd_u16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i8> @test_psub_i8x4(
+// RV32-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[A]], [[B]]
+// RV32-NEXT:    ret <4 x i8> [[SUB_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i8> @test_psub_i8x4(
+// RV64-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[A]], [[B]]
+// RV64-NEXT:    ret <4 x i8> [[SUB_I]]
 //
 int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
   return __riscv_psub_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_u8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i8> @test_psub_u8x4(
+// RV32-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[A]], [[B]]
+// RV32-NEXT:    ret <4 x i8> [[SUB_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i8> @test_psub_u8x4(
+// RV64-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[A]], [[B]]
+// RV64-NEXT:    ret <4 x i8> [[SUB_I]]
 //
 uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
   return __riscv_psub_u8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i16> @test_psub_i16x2(
+// RV32-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[A]], [[B]]
+// RV32-NEXT:    ret <2 x i16> [[SUB_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i16> @test_psub_i16x2(
+// RV64-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[A]], [[B]]
+// RV64-NEXT:    ret <2 x i16> [[SUB_I]]
 //
 int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
   return __riscv_psub_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psub_u16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i16> @test_psub_u16x2(
+// RV32-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP2]]
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[A]], [[B]]
+// RV32-NEXT:    ret <2 x i16> [[SUB_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i16> @test_psub_u16x2(
+// RV64-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[A]], [[B]]
+// RV64-NEXT:    ret <2 x i16> [[SUB_I]]
 //
 uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
   return __riscv_psub_u16x2(a, b);
@@ -212,265 +140,193 @@ uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
 
 /* 64-bit Packed Addition and Subtraction */
 
-// RV32-LABEL: define dso_local i64 @test_padd_i8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <8 x i8> @test_padd_i8x8(
+// RV32-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
+// RV32-NEXT:    ret <8 x i8> [[ADD_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <8 x i8> @test_padd_i8x8(
+// RV64-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
+// RV64-NEXT:    ret <8 x i8> [[ADD_I]]
 //
 int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
   return __riscv_padd_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_u8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <8 x i8> @test_padd_u8x8(
+// RV32-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
+// RV32-NEXT:    ret <8 x i8> [[ADD_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <8 x i8> @test_padd_u8x8(
+// RV64-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
+// RV64-NEXT:    ret <8 x i8> [[ADD_I]]
 //
 uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
   return __riscv_padd_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i16> @test_padd_i16x4(
+// RV32-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
+// RV32-NEXT:    ret <4 x i16> [[ADD_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i16> @test_padd_i16x4(
+// RV64-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
+// RV64-NEXT:    ret <4 x i16> [[ADD_I]]
 //
 int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
   return __riscv_padd_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_u16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i16> @test_padd_u16x4(
+// RV32-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
+// RV32-NEXT:    ret <4 x i16> [[ADD_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i16> @test_padd_u16x4(
+// RV64-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
+// RV64-NEXT:    ret <4 x i16> [[ADD_I]]
 //
 uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
   return __riscv_padd_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i32> @test_padd_i32x2(
+// RV32-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
+// RV32-NEXT:    ret <2 x i32> [[ADD_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i32> @test_padd_i32x2(
+// RV64-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
+// RV64-NEXT:    ret <2 x i32> [[ADD_I]]
 //
 int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
   return __riscv_padd_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_padd_u32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i32> @test_padd_u32x2(
+// RV32-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
+// RV32-NEXT:    ret <2 x i32> [[ADD_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_padd_u32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i32> @test_padd_u32x2(
+// RV64-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
+// RV64-NEXT:    ret <2 x i32> [[ADD_I]]
 //
 uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
   return __riscv_padd_u32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_i8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <8 x i8> @test_psub_i8x8(
+// RV32-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
+// RV32-NEXT:    ret <8 x i8> [[SUB_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <8 x i8> @test_psub_i8x8(
+// RV64-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
+// RV64-NEXT:    ret <8 x i8> [[SUB_I]]
 //
 int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
   return __riscv_psub_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_u8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <8 x i8> @test_psub_u8x8(
+// RV32-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
+// RV32-NEXT:    ret <8 x i8> [[SUB_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <8 x i8> @test_psub_u8x8(
+// RV64-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
+// RV64-NEXT:    ret <8 x i8> [[SUB_I]]
 //
 uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
   return __riscv_psub_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i16> @test_psub_i16x4(
+// RV32-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
+// RV32-NEXT:    ret <4 x i16> [[SUB_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i16> @test_psub_i16x4(
+// RV64-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
+// RV64-NEXT:    ret <4 x i16> [[SUB_I]]
 //
 int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
   return __riscv_psub_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_u16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i16> @test_psub_u16x4(
+// RV32-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
+// RV32-NEXT:    ret <4 x i16> [[SUB_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i16> @test_psub_u16x4(
+// RV64-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
+// RV64-NEXT:    ret <4 x i16> [[SUB_I]]
 //
 uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
   return __riscv_psub_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i32> @test_psub_i32x2(
+// RV32-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
+// RV32-NEXT:    ret <2 x i32> [[SUB_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i32> @test_psub_i32x2(
+// RV64-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
+// RV64-NEXT:    ret <2 x i32> [[SUB_I]]
 //
 int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
   return __riscv_psub_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psub_u32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i32> @test_psub_u32x2(
+// RV32-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP2]]
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
+// RV32-NEXT:    ret <2 x i32> [[SUB_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psub_u32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i32> @test_psub_u32x2(
+// RV64-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
-// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP2]]
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
+// RV64-NEXT:    ret <2 x i32> [[SUB_I]]
 //
 uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
   return __riscv_psub_u32x2(a, b);
@@ -478,225 +334,177 @@ uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
 
 /* 32-bit Packed Shifts */
 
-// RV32-LABEL: define dso_local i32 @test_psll_s_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
-//
-// RV64-LABEL: define dso_local i64 @test_psll_s_i8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV32-LABEL: define dso_local <4 x i8> @test_psll_s_i8x4(
+// RV32-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <4 x i8> [[SHL_I]]
+//
+// RV64-LABEL: define dso_local <4 x i8> @test_psll_s_i8x4(
+// RV64-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <4 x i8> [[SHL_I]]
 //
 int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
   return __riscv_psll_s_i8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psll_s_u8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i8> @test_psll_s_u8x4(
+// RV32-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <4 x i8> [[SHL_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i8> @test_psll_s_u8x4(
+// RV64-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <4 x i8> [[SHL_I]]
 //
 uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
   return __riscv_psll_s_u8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psll_s_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i16> @test_psll_s_i16x2(
+// RV32-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <2 x i16> [[SHL_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i16> @test_psll_s_i16x2(
+// RV64-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <2 x i16> [[SHL_I]]
 //
 int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
   return __riscv_psll_s_i16x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psll_s_u16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i16> @test_psll_s_u16x2(
+// RV32-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <2 x i16> [[SHL_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i16> @test_psll_s_u16x2(
+// RV64-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <2 x i16> [[SHL_I]]
 //
 uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
   return __riscv_psll_s_u16x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psra_s_i8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i8> @test_psra_s_i8x4(
+// RV32-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <4 x i8> [[SHR_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i8> @test_psra_s_i8x4(
+// RV64-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <4 x i8> [[SHR_I]]
 //
 int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
   return __riscv_psra_s_i8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psrl_s_u8x4(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i8> @test_psrl_s_u8x4(
+// RV32-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <4 x i8> [[SHR_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x4(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i8> @test_psrl_s_u8x4(
+// RV64-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <4 x i8> [[SHR_I]]
 //
 uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
   return __riscv_psrl_s_u8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psra_s_i16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i16> @test_psra_s_i16x2(
+// RV32-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <2 x i16> [[SHR_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i16> @test_psra_s_i16x2(
+// RV64-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <2 x i16> [[SHR_I]]
 //
 int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
   return __riscv_psra_s_i16x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i32 @test_psrl_s_u16x2(
-// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i16> @test_psrl_s_u16x2(
+// RV32-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <2 x i16> [[SHR_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x2(
-// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i16> @test_psrl_s_u16x2(
+// RV64-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
-// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <2 x i16> [[SHR_I]]
 //
 uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
   return __riscv_psrl_s_u16x2(a, shamt);
@@ -704,305 +512,257 @@ uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
 
 /* 64-bit Packed Shifts */
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_i8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <8 x i8> @test_psll_s_i8x8(
+// RV32-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <8 x i8> [[SHL_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <8 x i8> @test_psll_s_i8x8(
+// RV64-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <8 x i8> [[SHL_I]]
 //
 int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
   return __riscv_psll_s_i8x8(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_u8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <8 x i8> @test_psll_s_u8x8(
+// RV32-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <8 x i8> [[SHL_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <8 x i8> @test_psll_s_u8x8(
+// RV64-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <8 x i8> [[SHL_I]]
 //
 uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
   return __riscv_psll_s_u8x8(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i16> @test_psll_s_i16x4(
+// RV32-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <4 x i16> [[SHL_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i16> @test_psll_s_i16x4(
+// RV64-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <4 x i16> [[SHL_I]]
 //
 int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
   return __riscv_psll_s_i16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_u16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i16> @test_psll_s_u16x4(
+// RV32-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <4 x i16> [[SHL_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i16> @test_psll_s_u16x4(
+// RV64-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <4 x i16> [[SHL_I]]
 //
 uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
   return __riscv_psll_s_u16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i32> @test_psll_s_i32x2(
+// RV32-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    ret <2 x i32> [[SHL_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i32> @test_psll_s_i32x2(
+// RV64-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    ret <2 x i32> [[SHL_I]]
 //
 int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
   return __riscv_psll_s_i32x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psll_s_u32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i32> @test_psll_s_u32x2(
+// RV32-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    ret <2 x i32> [[SHL_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psll_s_u32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i32> @test_psll_s_u32x2(
+// RV64-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    ret <2 x i32> [[SHL_I]]
 //
 uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) {
   return __riscv_psll_s_u32x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psra_s_i8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <8 x i8> @test_psra_s_i8x8(
+// RV32-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <8 x i8> [[SHR_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <8 x i8> @test_psra_s_i8x8(
+// RV64-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <8 x i8> [[SHR_I]]
 //
 int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
   return __riscv_psra_s_i8x8(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psrl_s_u8x8(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <8 x i8> @test_psrl_s_u8x8(
+// RV32-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <8 x i8> [[SHR_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <8 x i8> @test_psrl_s_u8x8(
+// RV64-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <8 x i8> [[SHR_I]]
 //
 uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
   return __riscv_psrl_s_u8x8(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psra_s_i16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i16> @test_psra_s_i16x4(
+// RV32-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <4 x i16> [[SHR_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i16> @test_psra_s_i16x4(
+// RV64-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <4 x i16> [[SHR_I]]
 //
 int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
   return __riscv_psra_s_i16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psrl_s_u16x4(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <4 x i16> @test_psrl_s_u16x4(
+// RV32-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[A]], [[SH_PROM_I]]
+// RV32-NEXT:    ret <4 x i16> [[SHR_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <4 x i16> @test_psrl_s_u16x4(
+// RV64-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
-// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[A]], [[SH_PROM_I]]
+// RV64-NEXT:    ret <4 x i16> [[SHR_I]]
 //
 uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
   return __riscv_psrl_s_u16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psra_s_i32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i32> @test_psra_s_i32x2(
+// RV32-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    ret <2 x i32> [[SHR_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psra_s_i32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i32> @test_psra_s_i32x2(
+// RV64-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    ret <2 x i32> [[SHR_I]]
 //
 int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
   return __riscv_psra_s_i32x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local i64 @test_psrl_s_u32x2(
-// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local <2 x i32> @test_psrl_s_u32x2(
+// RV32-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP1]]
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    ret <2 x i32> [[SHR_I]]
 //
-// RV64-LABEL: define dso_local i64 @test_psrl_s_u32x2(
-// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local <2 x i32> @test_psrl_s_u32x2(
+// RV64-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP1]]
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    ret <2 x i32> [[SHR_I]]
 //
 uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned shamt) {
   return __riscv_psrl_s_u32x2(a, shamt);

>From 4d77f0dfe828d455d850da7940214db3374ab085 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Fri, 13 Feb 2026 10:00:53 +0800
Subject: [PATCH 3/8] Revert "[Clang][RISCV] Use direct vector types in IR for
 P extension"

This reverts commit e1abe5f3663a5277a5a9382a6e8f5b318cf40e6c.
---
 clang/lib/CodeGen/Targets/RISCV.cpp       |    7 -
 clang/test/CodeGen/RISCV/rvp-intrinsics.c | 1084 +++++++++++++--------
 2 files changed, 662 insertions(+), 429 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp
index 01496c4bdbb54..d1345891e9fb6 100644
--- a/clang/lib/CodeGen/Targets/RISCV.cpp
+++ b/clang/lib/CodeGen/Targets/RISCV.cpp
@@ -714,13 +714,6 @@ ABIArgInfo RISCVABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
       // Generic vector without riscv_vls_cc should fall through and pass by
       // reference.
       return coerceVLSVector(Ty, ABIVLen);
-    if (getContext().getTargetInfo().hasFeature("experimental-p") &&
-        VT->getVectorKind() == VectorKind::Generic &&
-        VT->getElementType()->isIntegerType() && (Size == 32 || Size == 64)) {
-      uint64_t EltSize = getContext().getTypeSize(VT->getElementType());
-      if (EltSize == 8 || EltSize == 16 || EltSize == 32)
-        return ABIArgInfo::getDirect();
-    }
   }
 
   // Aggregates which are <= 2*XLen will be passed in registers if possible,
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index 87ad2aa034b8c..40a21fa071387 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -10,129 +10,201 @@
 
 /* 32-bit Packed Addition and Subtraction */
 
-// RV32-LABEL: define dso_local <4 x i8> @test_padd_i8x4(
-// RV32-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[A]], [[B]]
-// RV32-NEXT:    ret <4 x i8> [[ADD_I]]
-//
-// RV64-LABEL: define dso_local <4 x i8> @test_padd_i8x4(
-// RV64-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[A]], [[B]]
-// RV64-NEXT:    ret <4 x i8> [[ADD_I]]
+// RV32-LABEL: define dso_local i32 @test_padd_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
+//
+// RV64-LABEL: define dso_local i64 @test_padd_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int8x4_t test_padd_i8x4(int8x4_t a, int8x4_t b) {
   return __riscv_padd_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local <4 x i8> @test_padd_u8x4(
-// RV32-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_padd_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[A]], [[B]]
-// RV32-NEXT:    ret <4 x i8> [[ADD_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <4 x i8> @test_padd_u8x4(
-// RV64-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[A]], [[B]]
-// RV64-NEXT:    ret <4 x i8> [[ADD_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint8x4_t test_padd_u8x4(uint8x4_t a, uint8x4_t b) {
   return __riscv_padd_u8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local <2 x i16> @test_padd_i16x2(
-// RV32-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_padd_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[A]], [[B]]
-// RV32-NEXT:    ret <2 x i16> [[ADD_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <2 x i16> @test_padd_i16x2(
-// RV64-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[A]], [[B]]
-// RV64-NEXT:    ret <2 x i16> [[ADD_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int16x2_t test_padd_i16x2(int16x2_t a, int16x2_t b) {
   return __riscv_padd_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local <2 x i16> @test_padd_u16x2(
-// RV32-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_padd_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[A]], [[B]]
-// RV32-NEXT:    ret <2 x i16> [[ADD_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <2 x i16> @test_padd_u16x2(
-// RV64-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[A]], [[B]]
-// RV64-NEXT:    ret <2 x i16> [[ADD_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[ADD_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint16x2_t test_padd_u16x2(uint16x2_t a, uint16x2_t b) {
   return __riscv_padd_u16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local <4 x i8> @test_psub_i8x4(
-// RV32-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psub_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[A]], [[B]]
-// RV32-NEXT:    ret <4 x i8> [[SUB_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <4 x i8> @test_psub_i8x4(
-// RV64-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[A]], [[B]]
-// RV64-NEXT:    ret <4 x i8> [[SUB_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int8x4_t test_psub_i8x4(int8x4_t a, int8x4_t b) {
   return __riscv_psub_i8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local <4 x i8> @test_psub_u8x4(
-// RV32-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psub_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[A]], [[B]]
-// RV32-NEXT:    ret <4 x i8> [[SUB_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <4 x i8> @test_psub_u8x4(
-// RV64-SAME: <4 x i8> noundef [[A:%.*]], <4 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[A]], [[B]]
-// RV64-NEXT:    ret <4 x i8> [[SUB_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <4 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint8x4_t test_psub_u8x4(uint8x4_t a, uint8x4_t b) {
   return __riscv_psub_u8x4(a, b);
 }
 
-// RV32-LABEL: define dso_local <2 x i16> @test_psub_i16x2(
-// RV32-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psub_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[A]], [[B]]
-// RV32-NEXT:    ret <2 x i16> [[SUB_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <2 x i16> @test_psub_i16x2(
-// RV64-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[A]], [[B]]
-// RV64-NEXT:    ret <2 x i16> [[SUB_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int16x2_t test_psub_i16x2(int16x2_t a, int16x2_t b) {
   return __riscv_psub_i16x2(a, b);
 }
 
-// RV32-LABEL: define dso_local <2 x i16> @test_psub_u16x2(
-// RV32-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psub_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[A]], [[B]]
-// RV32-NEXT:    ret <2 x i16> [[SUB_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i32 [[B_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <2 x i16> @test_psub_u16x2(
-// RV64-SAME: <2 x i16> noundef [[A:%.*]], <2 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i64 [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[A]], [[B]]
-// RV64-NEXT:    ret <2 x i16> [[SUB_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[COERCE_VAL_II1_I:%.*]] = trunc i64 [[B_COERCE]] to i32
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1_I]] to <2 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[SUB_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP2]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
   return __riscv_psub_u16x2(a, b);
@@ -140,193 +212,265 @@ uint16x2_t test_psub_u16x2(uint16x2_t a, uint16x2_t b) {
 
 /* 64-bit Packed Addition and Subtraction */
 
-// RV32-LABEL: define dso_local <8 x i8> @test_padd_i8x8(
-// RV32-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
-// RV32-NEXT:    ret <8 x i8> [[ADD_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <8 x i8> @test_padd_i8x8(
-// RV64-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
-// RV64-NEXT:    ret <8 x i8> [[ADD_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 int8x8_t test_padd_i8x8(int8x8_t a, int8x8_t b) {
   return __riscv_padd_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local <8 x i8> @test_padd_u8x8(
-// RV32-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
-// RV32-NEXT:    ret <8 x i8> [[ADD_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <8 x i8> @test_padd_u8x8(
-// RV64-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
-// RV64-NEXT:    ret <8 x i8> [[ADD_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 uint8x8_t test_padd_u8x8(uint8x8_t a, uint8x8_t b) {
   return __riscv_padd_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local <4 x i16> @test_padd_i16x4(
-// RV32-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
-// RV32-NEXT:    ret <4 x i16> [[ADD_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <4 x i16> @test_padd_i16x4(
-// RV64-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
-// RV64-NEXT:    ret <4 x i16> [[ADD_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 int16x4_t test_padd_i16x4(int16x4_t a, int16x4_t b) {
   return __riscv_padd_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local <4 x i16> @test_padd_u16x4(
-// RV32-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
-// RV32-NEXT:    ret <4 x i16> [[ADD_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <4 x i16> @test_padd_u16x4(
-// RV64-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
-// RV64-NEXT:    ret <4 x i16> [[ADD_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 uint16x4_t test_padd_u16x4(uint16x4_t a, uint16x4_t b) {
   return __riscv_padd_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local <2 x i32> @test_padd_i32x2(
-// RV32-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
-// RV32-NEXT:    ret <2 x i32> [[ADD_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <2 x i32> @test_padd_i32x2(
-// RV64-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
-// RV64-NEXT:    ret <2 x i32> [[ADD_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 int32x2_t test_padd_i32x2(int32x2_t a, int32x2_t b) {
   return __riscv_padd_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local <2 x i32> @test_padd_u32x2(
-// RV32-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
-// RV32-NEXT:    ret <2 x i32> [[ADD_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <2 x i32> @test_padd_u32x2(
-// RV64-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_padd_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
-// RV64-NEXT:    ret <2 x i32> [[ADD_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[ADD_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 uint32x2_t test_padd_u32x2(uint32x2_t a, uint32x2_t b) {
   return __riscv_padd_u32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local <8 x i8> @test_psub_i8x8(
-// RV32-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
-// RV32-NEXT:    ret <8 x i8> [[SUB_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <8 x i8> @test_psub_i8x8(
-// RV64-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
-// RV64-NEXT:    ret <8 x i8> [[SUB_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 int8x8_t test_psub_i8x8(int8x8_t a, int8x8_t b) {
   return __riscv_psub_i8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local <8 x i8> @test_psub_u8x8(
-// RV32-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
-// RV32-NEXT:    ret <8 x i8> [[SUB_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <8 x i8> @test_psub_u8x8(
-// RV64-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
-// RV64-NEXT:    ret <8 x i8> [[SUB_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <8 x i8> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 uint8x8_t test_psub_u8x8(uint8x8_t a, uint8x8_t b) {
   return __riscv_psub_u8x8(a, b);
 }
 
-// RV32-LABEL: define dso_local <4 x i16> @test_psub_i16x4(
-// RV32-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
-// RV32-NEXT:    ret <4 x i16> [[SUB_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <4 x i16> @test_psub_i16x4(
-// RV64-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
-// RV64-NEXT:    ret <4 x i16> [[SUB_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 int16x4_t test_psub_i16x4(int16x4_t a, int16x4_t b) {
   return __riscv_psub_i16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local <4 x i16> @test_psub_u16x4(
-// RV32-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
-// RV32-NEXT:    ret <4 x i16> [[SUB_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <4 x i16> @test_psub_u16x4(
-// RV64-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
-// RV64-NEXT:    ret <4 x i16> [[SUB_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <4 x i16> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 uint16x4_t test_psub_u16x4(uint16x4_t a, uint16x4_t b) {
   return __riscv_psub_u16x4(a, b);
 }
 
-// RV32-LABEL: define dso_local <2 x i32> @test_psub_i32x2(
-// RV32-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
-// RV32-NEXT:    ret <2 x i32> [[SUB_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <2 x i32> @test_psub_i32x2(
-// RV64-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
-// RV64-NEXT:    ret <2 x i32> [[SUB_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 int32x2_t test_psub_i32x2(int32x2_t a, int32x2_t b) {
   return __riscv_psub_i32x2(a, b);
 }
 
-// RV32-LABEL: define dso_local <2 x i32> @test_psub_u32x2(
-// RV32-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
-// RV32-NEXT:    ret <2 x i32> [[SUB_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV32-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP2]]
 //
-// RV64-LABEL: define dso_local <2 x i32> @test_psub_u32x2(
-// RV64-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psub_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
-// RV64-NEXT:    ret <2 x i32> [[SUB_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast i64 [[B_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[SUB_I:%.*]] = sub <2 x i32> [[TMP0]], [[TMP1]]
+// RV64-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SUB_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP2]]
 //
 uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
   return __riscv_psub_u32x2(a, b);
@@ -334,177 +478,225 @@ uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
 
 /* 32-bit Packed Shifts */
 
-// RV32-LABEL: define dso_local <4 x i8> @test_psll_s_i8x4(
-// RV32-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
-// RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <4 x i8> [[SHL_I]]
-//
-// RV64-LABEL: define dso_local <4 x i8> @test_psll_s_i8x4(
-// RV64-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
-// RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <4 x i8> [[SHL_I]]
+// RV32-LABEL: define dso_local i32 @test_psll_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
+//
+// RV64-LABEL: define dso_local i64 @test_psll_s_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
   return __riscv_psll_s_i8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <4 x i8> @test_psll_s_u8x4(
-// RV32-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psll_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <4 x i8> [[SHL_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <4 x i8> @test_psll_s_u8x4(
-// RV64-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <4 x i8> [[SHL_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
   return __riscv_psll_s_u8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <2 x i16> @test_psll_s_i16x2(
-// RV32-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psll_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <2 x i16> [[SHL_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <2 x i16> @test_psll_s_i16x2(
-// RV64-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <2 x i16> [[SHL_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
   return __riscv_psll_s_i16x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <2 x i16> @test_psll_s_u16x2(
-// RV32-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psll_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <2 x i16> [[SHL_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <2 x i16> @test_psll_s_u16x2(
-// RV64-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <2 x i16> [[SHL_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
   return __riscv_psll_s_u16x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <4 x i8> @test_psra_s_i8x4(
-// RV32-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psra_s_i8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <4 x i8> [[SHR_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <4 x i8> @test_psra_s_i8x4(
-// RV64-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psra_s_i8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <4 x i8> [[SHR_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
   return __riscv_psra_s_i8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <4 x i8> @test_psrl_s_u8x4(
-// RV32-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u8x4(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <4 x i8> [[SHR_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <4 x i8> @test_psrl_s_u8x4(
-// RV64-SAME: <4 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x4(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <4 x i8> [[SHR_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
   return __riscv_psrl_s_u8x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <2 x i16> @test_psra_s_i16x2(
-// RV32-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psra_s_i16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <2 x i16> [[SHR_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <2 x i16> @test_psra_s_i16x2(
-// RV64-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psra_s_i16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <2 x i16> [[SHR_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
   return __riscv_psra_s_i16x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <2 x i16> @test_psrl_s_u16x2(
-// RV32-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i32 @test_psrl_s_u16x2(
+// RV32-SAME: i32 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <2 x i16> [[SHR_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <2 x i16> @test_psrl_s_u16x2(
-// RV64-SAME: <2 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x2(
+// RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <2 x i16> [[SHR_I]]
+// RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
   return __riscv_psrl_s_u16x2(a, shamt);
@@ -512,257 +704,305 @@ uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
 
 /* 64-bit Packed Shifts */
 
-// RV32-LABEL: define dso_local <8 x i8> @test_psll_s_i8x8(
-// RV32-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <8 x i8> [[SHL_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <8 x i8> @test_psll_s_i8x8(
-// RV64-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <8 x i8> [[SHL_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
 //
 int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
   return __riscv_psll_s_i8x8(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <8 x i8> @test_psll_s_u8x8(
-// RV32-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <8 x i8> [[SHL_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <8 x i8> @test_psll_s_u8x8(
-// RV64-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <8 x i8> [[SHL_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
 //
 uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
   return __riscv_psll_s_u8x8(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <4 x i16> @test_psll_s_i16x4(
-// RV32-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <4 x i16> [[SHL_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <4 x i16> @test_psll_s_i16x4(
-// RV64-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <4 x i16> [[SHL_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
 //
 int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
   return __riscv_psll_s_i16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <4 x i16> @test_psll_s_u16x4(
-// RV32-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <4 x i16> [[SHL_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <4 x i16> @test_psll_s_u16x4(
-// RV64-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <4 x i16> [[SHL_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
 //
 uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
   return __riscv_psll_s_u16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <2 x i32> @test_psll_s_i32x2(
-// RV32-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    ret <2 x i32> [[SHL_I]]
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local <2 x i32> @test_psll_s_i32x2(
-// RV64-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    ret <2 x i32> [[SHL_I]]
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
 int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
   return __riscv_psll_s_i32x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <2 x i32> @test_psll_s_u32x2(
-// RV32-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    ret <2 x i32> [[SHL_I]]
+// RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local <2 x i32> @test_psll_s_u32x2(
-// RV64-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psll_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    ret <2 x i32> [[SHL_I]]
+// RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
 uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) {
   return __riscv_psll_s_u32x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <8 x i8> @test_psra_s_i8x8(
-// RV32-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <8 x i8> [[SHR_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <8 x i8> @test_psra_s_i8x8(
-// RV64-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <8 x i8> [[SHR_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
 //
 int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
   return __riscv_psra_s_i8x8(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <8 x i8> @test_psrl_s_u8x8(
-// RV32-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <8 x i8> [[SHR_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <8 x i8> @test_psrl_s_u8x8(
-// RV64-SAME: <8 x i8> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <8 x i8> [[SHR_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
 //
 uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
   return __riscv_psrl_s_u8x8(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <4 x i16> @test_psra_s_i16x4(
-// RV32-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <4 x i16> [[SHR_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <4 x i16> @test_psra_s_i16x4(
-// RV64-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <4 x i16> [[SHR_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
 //
 int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
   return __riscv_psra_s_i16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <4 x i16> @test_psrl_s_u16x4(
-// RV32-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
-// RV32-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[A]], [[SH_PROM_I]]
-// RV32-NEXT:    ret <4 x i16> [[SHR_I]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP3]]
 //
-// RV64-LABEL: define dso_local <4 x i16> @test_psrl_s_u16x4(
-// RV64-SAME: <4 x i16> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
-// RV64-NEXT:    [[TMP0:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[A]], [[SH_PROM_I]]
-// RV64-NEXT:    ret <4 x i16> [[SHR_I]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
+// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
+// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP3]]
 //
 uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
   return __riscv_psrl_s_u16x4(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <2 x i32> @test_psra_s_i32x2(
-// RV32-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    ret <2 x i32> [[SHR_I]]
+// RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local <2 x i32> @test_psra_s_i32x2(
-// RV64-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psra_s_i32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    ret <2 x i32> [[SHR_I]]
+// RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
 int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
   return __riscv_psra_s_i32x2(a, shamt);
 }
 
-// RV32-LABEL: define dso_local <2 x i32> @test_psrl_s_u32x2(
-// RV32-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV32-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
-// RV32-NEXT:    ret <2 x i32> [[SHR_I]]
+// RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP1]]
 //
-// RV64-LABEL: define dso_local <2 x i32> @test_psrl_s_u32x2(
-// RV64-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
+// RV64-LABEL: define dso_local i64 @test_psrl_s_u32x2(
+// RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
 // RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
-// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[A]], [[SPLAT_SPLAT_I]]
-// RV64-NEXT:    ret <2 x i32> [[SHR_I]]
+// RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
+// RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP1]]
 //
 uint32x2_t test_psrl_s_u32x2(uint32x2_t a, unsigned shamt) {
   return __riscv_psrl_s_u32x2(a, shamt);

>From 27ed41c626b955fff6e843c78e00be9a92b1afa8 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Fri, 13 Feb 2026 10:14:32 +0800
Subject: [PATCH 4/8] Fix list alphabetization and line alignment

---
 clang/lib/Headers/CMakeLists.txt | 6 +++---
 clang/lib/Headers/riscv_simd.h   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 76574d7a937e8..87002faffcb93 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -129,15 +129,15 @@ set(ppc_htm_files
   )
 
 set(riscv_files
+  andes_vector.h
   riscv_bitmanip.h
   riscv_corev_alu.h
   riscv_crypto.h
+  riscv_mips.h
   riscv_nds.h
   riscv_ntlh.h
-  sifive_vector.h
-  andes_vector.h
-  riscv_mips.h
   riscv_simd.h
+  sifive_vector.h
   )
 
 set(spirv_files
diff --git a/clang/lib/Headers/riscv_simd.h b/clang/lib/Headers/riscv_simd.h
index 262f35b483cbd..21d4d01628562 100644
--- a/clang/lib/Headers/riscv_simd.h
+++ b/clang/lib/Headers/riscv_simd.h
@@ -1,4 +1,4 @@
-/*===---- riscv_simd.h - RISC-V P intrinsics -----------------===
+/*===---- riscv_simd.h - RISC-V P intrinsics -------------------------------===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.

>From 862d2dfb301cfd6cf7e33e24e26bba5c35295945 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Tue, 24 Feb 2026 14:24:08 +0800
Subject: [PATCH 5/8] Rename P-extension header to riscv_packed.h

---
 clang/lib/Headers/CMakeLists.txt                   | 2 +-
 clang/lib/Headers/{riscv_simd.h => riscv_packed.h} | 8 ++++----
 clang/test/CodeGen/RISCV/rvp-intrinsics.c          | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)
 rename clang/lib/Headers/{riscv_simd.h => riscv_packed.h} (98%)

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 87002faffcb93..055b1b1d261e2 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -136,7 +136,7 @@ set(riscv_files
   riscv_mips.h
   riscv_nds.h
   riscv_ntlh.h
-  riscv_simd.h
+  riscv_packed.h
   sifive_vector.h
   )
 
diff --git a/clang/lib/Headers/riscv_simd.h b/clang/lib/Headers/riscv_packed.h
similarity index 98%
rename from clang/lib/Headers/riscv_simd.h
rename to clang/lib/Headers/riscv_packed.h
index 21d4d01628562..ef2d7b878c2eb 100644
--- a/clang/lib/Headers/riscv_simd.h
+++ b/clang/lib/Headers/riscv_packed.h
@@ -1,4 +1,4 @@
-/*===---- riscv_simd.h - RISC-V P intrinsics -------------------------------===
+/*===---- riscv_packed.h - RISC-V P intrinsics -----------------------------===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -7,8 +7,8 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef __RISCV_SIMD_H
-#define __RISCV_SIMD_H
+#ifndef __RISCV_PACKED_H
+#define __RISCV_PACKED_H
 
 #include <stdint.h>
 
@@ -242,4 +242,4 @@ __riscv_psra_s_i32x2(int32x2_t __rs1, unsigned __shamt) {
 }
 #endif
 
-#endif /* __RISCV_SIMD_H */
+#endif /* __RISCV_PACKED_H */
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index 40a21fa071387..c80a6ad4e95e7 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -6,7 +6,7 @@
 // RUN:   -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=sroa,instcombine | FileCheck %s --check-prefix=RV64
 
-#include <riscv_simd.h>
+#include <riscv_packed.h>
 
 /* 32-bit Packed Addition and Subtraction */
 

>From 4a5dec9e1cfe15b16cdf8dce22ddc967a9ccf1cf Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Wed, 25 Feb 2026 17:26:03 +0800
Subject: [PATCH 6/8] [RISCV] Refactor P-extension intrinsics with macros

Co-authored-by: Alexander Richardson <alexrichardson at google.com>
---
 clang/lib/Headers/riscv_packed.h | 251 +++++++------------------------
 1 file changed, 52 insertions(+), 199 deletions(-)

diff --git a/clang/lib/Headers/riscv_packed.h b/clang/lib/Headers/riscv_packed.h
index ef2d7b878c2eb..07822f257630e 100644
--- a/clang/lib/Headers/riscv_packed.h
+++ b/clang/lib/Headers/riscv_packed.h
@@ -30,213 +30,66 @@ typedef uint16_t uint16x4_t __attribute__((vector_size(8)));
 typedef int32_t int32x2_t __attribute__((vector_size(8)));
 typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
 
-/* Packed Addition and Subtraction (32-bit) */
-
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_i8x4(int8x4_t __rs1, int8x4_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_u8x4(uint8x4_t __rs1, uint8x4_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_i16x2(int16x2_t __rs1, int16x2_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_u16x2(uint16x2_t __rs1, uint16x2_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_i8x4(int8x4_t __rs1, int8x4_t __rs2) {
-  return __rs1 - __rs2;
-}
+#define _packed_binop(name, retty, ty1, ty2, op)                               \
+  static __inline__ retty __attribute__((__always_inline__, __nodebug__))      \
+  __riscv_##name(ty1 __rs1, ty2 __rs2) {                                       \
+    return __rs1 op __rs2;                                                     \
+  }
 
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_u8x4(uint8x4_t __rs1, uint8x4_t __rs2) {
-  return __rs1 - __rs2;
-}
+#define _packed_addsub(name, ty, op) _packed_binop(name, ty, ty, ty, op)
+#define _packed_shift(name, ty, op) _packed_binop(name, ty, ty, unsigned, op)
 
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_i16x2(int16x2_t __rs1, int16x2_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_u16x2(uint16x2_t __rs1, uint16x2_t __rs2) {
-  return __rs1 - __rs2;
-}
+/* Packed Addition and Subtraction (32-bit) */
+_packed_addsub(padd_i8x4, int8x4_t, +)
+_packed_addsub(padd_u8x4, uint8x4_t, +)
+_packed_addsub(padd_i16x2, int16x2_t, +)
+_packed_addsub(padd_u16x2, uint16x2_t, +)
+_packed_addsub(psub_i8x4, int8x4_t, -)
+_packed_addsub(psub_u8x4, uint8x4_t, -)
+_packed_addsub(psub_i16x2, int16x2_t, -)
+_packed_addsub(psub_u16x2, uint16x2_t, -)
 
 /* Packed Addition and Subtraction (64-bit) */
-
-static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_i8x8(int8x8_t __rs1, int8x8_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_u8x8(uint8x8_t __rs1, uint8x8_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_i16x4(int16x4_t __rs1, int16x4_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_u16x4(uint16x4_t __rs1, uint16x4_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_i32x2(int32x2_t __rs1, int32x2_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_padd_u32x2(uint32x2_t __rs1, uint32x2_t __rs2) {
-  return __rs1 + __rs2;
-}
-
-static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_i8x8(int8x8_t __rs1, int8x8_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_u8x8(uint8x8_t __rs1, uint8x8_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_i16x4(int16x4_t __rs1, int16x4_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_u16x4(uint16x4_t __rs1, uint16x4_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_i32x2(int32x2_t __rs1, int32x2_t __rs2) {
-  return __rs1 - __rs2;
-}
-
-static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psub_u32x2(uint32x2_t __rs1, uint32x2_t __rs2) {
-  return __rs1 - __rs2;
-}
+_packed_addsub(padd_i8x8, int8x8_t, +)
+_packed_addsub(padd_u8x8, uint8x8_t, +)
+_packed_addsub(padd_i16x4, int16x4_t, +)
+_packed_addsub(padd_u16x4, uint16x4_t, +)
+_packed_addsub(padd_i32x2, int32x2_t, +)
+_packed_addsub(padd_u32x2, uint32x2_t, +)
+_packed_addsub(psub_i8x8, int8x8_t, -)
+_packed_addsub(psub_u8x8, uint8x8_t, -)
+_packed_addsub(psub_i16x4, int16x4_t, -)
+_packed_addsub(psub_u16x4, uint16x4_t, -)
+_packed_addsub(psub_i32x2, int32x2_t, -)
+_packed_addsub(psub_u32x2, uint32x2_t, -)
 
 /* Packed Shifts (32-bit) */
-
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_u8x4(uint8x4_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_i8x4(int8x4_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_u16x2(uint16x2_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_i16x2(int16x2_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psrl_s_u8x4(uint8x4_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psrl_s_u16x2(uint16x2_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psra_s_i8x4(int8x4_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psra_s_i16x2(int16x2_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
+_packed_shift(psll_s_u8x4, uint8x4_t, <<)
+_packed_shift(psll_s_i8x4, int8x4_t, <<)
+_packed_shift(psll_s_u16x2, uint16x2_t, <<)
+_packed_shift(psll_s_i16x2, int16x2_t, <<)
+_packed_shift(psrl_s_u8x4, uint8x4_t, >>)
+_packed_shift(psrl_s_u16x2, uint16x2_t, >>)
+_packed_shift(psra_s_i8x4, int8x4_t, >>)
+_packed_shift(psra_s_i16x2, int16x2_t, >>)
 
 /* Packed Shifts (64-bit) */
-
-static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_u8x8(uint8x8_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_i8x8(int8x8_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_u16x4(uint16x4_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_i16x4(int16x4_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_u32x2(uint32x2_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psll_s_i32x2(int32x2_t __rs1, unsigned __shamt) {
-  return __rs1 << __shamt;
-}
-
-static __inline__ uint8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psrl_s_u8x8(uint8x8_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ uint16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psrl_s_u16x4(uint16x4_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ uint32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psrl_s_u32x2(uint32x2_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ int8x8_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psra_s_i8x8(int8x8_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ int16x4_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psra_s_i16x4(int16x4_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
-
-static __inline__ int32x2_t __attribute__((__always_inline__, __nodebug__))
-__riscv_psra_s_i32x2(int32x2_t __rs1, unsigned __shamt) {
-  return __rs1 >> __shamt;
-}
+_packed_shift(psll_s_u8x8, uint8x8_t, <<)
+_packed_shift(psll_s_i8x8, int8x8_t, <<)
+_packed_shift(psll_s_u16x4, uint16x4_t, <<)
+_packed_shift(psll_s_i16x4, int16x4_t, <<)
+_packed_shift(psll_s_u32x2, uint32x2_t, <<)
+_packed_shift(psll_s_i32x2, int32x2_t, <<)
+_packed_shift(psrl_s_u8x8, uint8x8_t, >>)
+_packed_shift(psrl_s_u16x4, uint16x4_t, >>)
+_packed_shift(psrl_s_u32x2, uint32x2_t, >>)
+_packed_shift(psra_s_i8x8, int8x8_t, >>)
+_packed_shift(psra_s_i16x4, int16x4_t, >>)
+_packed_shift(psra_s_i32x2, int32x2_t, >>)
+
+#undef _packed_addsub
+#undef _packed_shift
+#undef _packed_binop
 
 #if defined(__cplusplus)
 }

>From dc733b6330e0494cc992f7ccd361296e6d6b4280 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Mon, 2 Mar 2026 14:05:56 +0800
Subject: [PATCH 7/8] [RISCV] Standardize P-extension intrinsics macros and
 types

---
 clang/lib/Headers/riscv_packed.h | 112 +++++++++++++++----------------
 1 file changed, 56 insertions(+), 56 deletions(-)

diff --git a/clang/lib/Headers/riscv_packed.h b/clang/lib/Headers/riscv_packed.h
index 07822f257630e..b201c1f1d3f0b 100644
--- a/clang/lib/Headers/riscv_packed.h
+++ b/clang/lib/Headers/riscv_packed.h
@@ -18,78 +18,78 @@ extern "C" {
 
 /* Packed SIMD Types */
 
-typedef int8_t int8x4_t __attribute__((vector_size(4)));
-typedef uint8_t uint8x4_t __attribute__((vector_size(4)));
-typedef int16_t int16x2_t __attribute__((vector_size(4)));
-typedef uint16_t uint16x2_t __attribute__((vector_size(4)));
+typedef int8_t int8x4_t __attribute__((__vector_size__(4), __aligned__(4)));
+typedef uint8_t uint8x4_t __attribute__((__vector_size__(4), __aligned__(4)));
+typedef int16_t int16x2_t __attribute__((__vector_size__(4), __aligned__(4)));
+typedef uint16_t uint16x2_t __attribute__((__vector_size__(4), __aligned__(4)));
 
-typedef int8_t int8x8_t __attribute__((vector_size(8)));
-typedef uint8_t uint8x8_t __attribute__((vector_size(8)));
-typedef int16_t int16x4_t __attribute__((vector_size(8)));
-typedef uint16_t uint16x4_t __attribute__((vector_size(8)));
-typedef int32_t int32x2_t __attribute__((vector_size(8)));
-typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
+typedef int8_t int8x8_t __attribute__((__vector_size__(8), __aligned__(8)));
+typedef uint8_t uint8x8_t __attribute__((__vector_size__(8), __aligned__(8)));
+typedef int16_t int16x4_t __attribute__((__vector_size__(8), __aligned__(8)));
+typedef uint16_t uint16x4_t __attribute__((__vector_size__(8), __aligned__(8)));
+typedef int32_t int32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
+typedef uint32_t uint32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
 
-#define _packed_binop(name, retty, ty1, ty2, op)                               \
+#define __packed_binop(name, retty, ty1, ty2, op)                              \
   static __inline__ retty __attribute__((__always_inline__, __nodebug__))      \
   __riscv_##name(ty1 __rs1, ty2 __rs2) {                                       \
     return __rs1 op __rs2;                                                     \
   }
 
-#define _packed_addsub(name, ty, op) _packed_binop(name, ty, ty, ty, op)
-#define _packed_shift(name, ty, op) _packed_binop(name, ty, ty, unsigned, op)
+#define __packed_addsub(name, ty, op) __packed_binop(name, ty, ty, ty, op)
+#define __packed_shift(name, ty, op) __packed_binop(name, ty, ty, unsigned, op)
 
 /* Packed Addition and Subtraction (32-bit) */
-_packed_addsub(padd_i8x4, int8x4_t, +)
-_packed_addsub(padd_u8x4, uint8x4_t, +)
-_packed_addsub(padd_i16x2, int16x2_t, +)
-_packed_addsub(padd_u16x2, uint16x2_t, +)
-_packed_addsub(psub_i8x4, int8x4_t, -)
-_packed_addsub(psub_u8x4, uint8x4_t, -)
-_packed_addsub(psub_i16x2, int16x2_t, -)
-_packed_addsub(psub_u16x2, uint16x2_t, -)
+__packed_addsub(padd_i8x4, int8x4_t, +)
+__packed_addsub(padd_u8x4, uint8x4_t, +)
+__packed_addsub(padd_i16x2, int16x2_t, +)
+__packed_addsub(padd_u16x2, uint16x2_t, +)
+__packed_addsub(psub_i8x4, int8x4_t, -)
+__packed_addsub(psub_u8x4, uint8x4_t, -)
+__packed_addsub(psub_i16x2, int16x2_t, -)
+__packed_addsub(psub_u16x2, uint16x2_t, -)
 
 /* Packed Addition and Subtraction (64-bit) */
-_packed_addsub(padd_i8x8, int8x8_t, +)
-_packed_addsub(padd_u8x8, uint8x8_t, +)
-_packed_addsub(padd_i16x4, int16x4_t, +)
-_packed_addsub(padd_u16x4, uint16x4_t, +)
-_packed_addsub(padd_i32x2, int32x2_t, +)
-_packed_addsub(padd_u32x2, uint32x2_t, +)
-_packed_addsub(psub_i8x8, int8x8_t, -)
-_packed_addsub(psub_u8x8, uint8x8_t, -)
-_packed_addsub(psub_i16x4, int16x4_t, -)
-_packed_addsub(psub_u16x4, uint16x4_t, -)
-_packed_addsub(psub_i32x2, int32x2_t, -)
-_packed_addsub(psub_u32x2, uint32x2_t, -)
+__packed_addsub(padd_i8x8, int8x8_t, +)
+__packed_addsub(padd_u8x8, uint8x8_t, +)
+__packed_addsub(padd_i16x4, int16x4_t, +)
+__packed_addsub(padd_u16x4, uint16x4_t, +)
+__packed_addsub(padd_i32x2, int32x2_t, +)
+__packed_addsub(padd_u32x2, uint32x2_t, +)
+__packed_addsub(psub_i8x8, int8x8_t, -)
+__packed_addsub(psub_u8x8, uint8x8_t, -)
+__packed_addsub(psub_i16x4, int16x4_t, -)
+__packed_addsub(psub_u16x4, uint16x4_t, -)
+__packed_addsub(psub_i32x2, int32x2_t, -)
+__packed_addsub(psub_u32x2, uint32x2_t, -)
 
 /* Packed Shifts (32-bit) */
-_packed_shift(psll_s_u8x4, uint8x4_t, <<)
-_packed_shift(psll_s_i8x4, int8x4_t, <<)
-_packed_shift(psll_s_u16x2, uint16x2_t, <<)
-_packed_shift(psll_s_i16x2, int16x2_t, <<)
-_packed_shift(psrl_s_u8x4, uint8x4_t, >>)
-_packed_shift(psrl_s_u16x2, uint16x2_t, >>)
-_packed_shift(psra_s_i8x4, int8x4_t, >>)
-_packed_shift(psra_s_i16x2, int16x2_t, >>)
+__packed_shift(psll_s_u8x4, uint8x4_t, <<)
+__packed_shift(psll_s_i8x4, int8x4_t, <<)
+__packed_shift(psll_s_u16x2, uint16x2_t, <<)
+__packed_shift(psll_s_i16x2, int16x2_t, <<)
+__packed_shift(psrl_s_u8x4, uint8x4_t, >>)
+__packed_shift(psrl_s_u16x2, uint16x2_t, >>)
+__packed_shift(psra_s_i8x4, int8x4_t, >>)
+__packed_shift(psra_s_i16x2, int16x2_t, >>)
 
 /* Packed Shifts (64-bit) */
-_packed_shift(psll_s_u8x8, uint8x8_t, <<)
-_packed_shift(psll_s_i8x8, int8x8_t, <<)
-_packed_shift(psll_s_u16x4, uint16x4_t, <<)
-_packed_shift(psll_s_i16x4, int16x4_t, <<)
-_packed_shift(psll_s_u32x2, uint32x2_t, <<)
-_packed_shift(psll_s_i32x2, int32x2_t, <<)
-_packed_shift(psrl_s_u8x8, uint8x8_t, >>)
-_packed_shift(psrl_s_u16x4, uint16x4_t, >>)
-_packed_shift(psrl_s_u32x2, uint32x2_t, >>)
-_packed_shift(psra_s_i8x8, int8x8_t, >>)
-_packed_shift(psra_s_i16x4, int16x4_t, >>)
-_packed_shift(psra_s_i32x2, int32x2_t, >>)
+__packed_shift(psll_s_u8x8, uint8x8_t, <<)
+__packed_shift(psll_s_i8x8, int8x8_t, <<)
+__packed_shift(psll_s_u16x4, uint16x4_t, <<)
+__packed_shift(psll_s_i16x4, int16x4_t, <<)
+__packed_shift(psll_s_u32x2, uint32x2_t, <<)
+__packed_shift(psll_s_i32x2, int32x2_t, <<)
+__packed_shift(psrl_s_u8x8, uint8x8_t, >>)
+__packed_shift(psrl_s_u16x4, uint16x4_t, >>)
+__packed_shift(psrl_s_u32x2, uint32x2_t, >>)
+__packed_shift(psra_s_i8x8, int8x8_t, >>)
+__packed_shift(psra_s_i16x4, int16x4_t, >>)
+__packed_shift(psra_s_i32x2, int32x2_t, >>)
 
-#undef _packed_addsub
-#undef _packed_shift
-#undef _packed_binop
+#undef __packed_addsub
+#undef __packed_shift
+#undef __packed_binop
 
 #if defined(__cplusplus)
 }

>From d905aeb42106d64d1c4302be1dbd7da2dd3d57a7 Mon Sep 17 00:00:00 2001
From: SiHuaN <liyongtai at iscas.ac.cn>
Date: Thu, 12 Mar 2026 19:44:17 +0800
Subject: [PATCH 8/8] [Clang][RISCV] Mask shift amounts in P extension
 intrinsics to avoid UB

---
 clang/lib/Headers/riscv_packed.h          |  60 +++--
 clang/test/CodeGen/RISCV/rvp-intrinsics.c | 312 ++++++++++++----------
 2 files changed, 210 insertions(+), 162 deletions(-)

diff --git a/clang/lib/Headers/riscv_packed.h b/clang/lib/Headers/riscv_packed.h
index b201c1f1d3f0b..50095bef7ddb3 100644
--- a/clang/lib/Headers/riscv_packed.h
+++ b/clang/lib/Headers/riscv_packed.h
@@ -30,14 +30,20 @@ typedef uint16_t uint16x4_t __attribute__((__vector_size__(8), __aligned__(8)));
 typedef int32_t int32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
 typedef uint32_t uint32x2_t __attribute__((__vector_size__(8), __aligned__(8)));
 
-#define __packed_binop(name, retty, ty1, ty2, op)                              \
-  static __inline__ retty __attribute__((__always_inline__, __nodebug__))      \
-  __riscv_##name(ty1 __rs1, ty2 __rs2) {                                       \
+#define __packed_addsub(name, ty, op)                                          \
+  static __inline__ ty __attribute__((__always_inline__, __nodebug__))         \
+  __riscv_##name(ty __rs1, ty __rs2) {                                         \
     return __rs1 op __rs2;                                                     \
   }
 
-#define __packed_addsub(name, ty, op) __packed_binop(name, ty, ty, ty, op)
-#define __packed_shift(name, ty, op) __packed_binop(name, ty, ty, unsigned, op)
+#define __packed_shift(name, ty, op, mask)                                     \
+  static __inline__ ty __attribute__((__always_inline__, __nodebug__))         \
+  __riscv_##name(ty __rs1, unsigned __rs2) {                                   \
+    return __rs1 op (__rs2 & (mask));                                          \
+  }
+#define __packed_shift8(name, ty, op) __packed_shift(name, ty, op, 0x7)
+#define __packed_shift16(name, ty, op) __packed_shift(name, ty, op, 0xf)
+#define __packed_shift32(name, ty, op) __packed_shift(name, ty, op, 0x1f)
 
 /* Packed Addition and Subtraction (32-bit) */
 __packed_addsub(padd_i8x4, int8x4_t, +)
@@ -64,32 +70,34 @@ __packed_addsub(psub_i32x2, int32x2_t, -)
 __packed_addsub(psub_u32x2, uint32x2_t, -)
 
 /* Packed Shifts (32-bit) */
-__packed_shift(psll_s_u8x4, uint8x4_t, <<)
-__packed_shift(psll_s_i8x4, int8x4_t, <<)
-__packed_shift(psll_s_u16x2, uint16x2_t, <<)
-__packed_shift(psll_s_i16x2, int16x2_t, <<)
-__packed_shift(psrl_s_u8x4, uint8x4_t, >>)
-__packed_shift(psrl_s_u16x2, uint16x2_t, >>)
-__packed_shift(psra_s_i8x4, int8x4_t, >>)
-__packed_shift(psra_s_i16x2, int16x2_t, >>)
+__packed_shift8(psll_s_u8x4, uint8x4_t, <<)
+__packed_shift8(psll_s_i8x4, int8x4_t, <<)
+__packed_shift16(psll_s_u16x2, uint16x2_t, <<)
+__packed_shift16(psll_s_i16x2, int16x2_t, <<)
+__packed_shift8(psrl_s_u8x4, uint8x4_t, >>)
+__packed_shift16(psrl_s_u16x2, uint16x2_t, >>)
+__packed_shift8(psra_s_i8x4, int8x4_t, >>)
+__packed_shift16(psra_s_i16x2, int16x2_t, >>)
 
 /* Packed Shifts (64-bit) */
-__packed_shift(psll_s_u8x8, uint8x8_t, <<)
-__packed_shift(psll_s_i8x8, int8x8_t, <<)
-__packed_shift(psll_s_u16x4, uint16x4_t, <<)
-__packed_shift(psll_s_i16x4, int16x4_t, <<)
-__packed_shift(psll_s_u32x2, uint32x2_t, <<)
-__packed_shift(psll_s_i32x2, int32x2_t, <<)
-__packed_shift(psrl_s_u8x8, uint8x8_t, >>)
-__packed_shift(psrl_s_u16x4, uint16x4_t, >>)
-__packed_shift(psrl_s_u32x2, uint32x2_t, >>)
-__packed_shift(psra_s_i8x8, int8x8_t, >>)
-__packed_shift(psra_s_i16x4, int16x4_t, >>)
-__packed_shift(psra_s_i32x2, int32x2_t, >>)
+__packed_shift8(psll_s_u8x8, uint8x8_t, <<)
+__packed_shift8(psll_s_i8x8, int8x8_t, <<)
+__packed_shift16(psll_s_u16x4, uint16x4_t, <<)
+__packed_shift16(psll_s_i16x4, int16x4_t, <<)
+__packed_shift32(psll_s_u32x2, uint32x2_t, <<)
+__packed_shift32(psll_s_i32x2, int32x2_t, <<)
+__packed_shift8(psrl_s_u8x8, uint8x8_t, >>)
+__packed_shift16(psrl_s_u16x4, uint16x4_t, >>)
+__packed_shift32(psrl_s_u32x2, uint32x2_t, >>)
+__packed_shift8(psra_s_i8x8, int8x8_t, >>)
+__packed_shift16(psra_s_i16x4, int16x4_t, >>)
+__packed_shift32(psra_s_i32x2, int32x2_t, >>)
 
 #undef __packed_addsub
 #undef __packed_shift
-#undef __packed_binop
+#undef __packed_shift8
+#undef __packed_shift16
+#undef __packed_shift32
 
 #if defined(__cplusplus)
 }
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index c80a6ad4e95e7..1c2899684ca39 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -483,11 +483,12 @@ uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_i8x4(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -495,11 +496,12 @@ uint32x2_t test_psub_u32x2(uint32x2_t a, uint32x2_t b) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
@@ -511,11 +513,12 @@ int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_u8x4(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -523,11 +526,12 @@ int8x4_t test_psll_s_i8x4(int8x4_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
@@ -539,11 +543,12 @@ uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_i16x2(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -551,11 +556,12 @@ uint8x4_t test_psll_s_u8x4(uint8x4_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
@@ -567,11 +573,12 @@ int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_u16x2(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -579,11 +586,12 @@ int16x2_t test_psll_s_i16x2(int16x2_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHL_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
@@ -595,11 +603,12 @@ uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psra_s_i8x4(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -607,11 +616,12 @@ uint16x2_t test_psll_s_u16x2(uint16x2_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
@@ -623,11 +633,12 @@ int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <4 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psrl_s_u8x4(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -635,11 +646,12 @@ int8x4_t test_psra_s_i8x4(int8x4_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <4 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i8> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
@@ -651,11 +663,12 @@ uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psra_s_i16x2(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -663,11 +676,12 @@ uint8x4_t test_psrl_s_u8x4(uint8x4_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
@@ -679,11 +693,12 @@ int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[A_COERCE]] to <2 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV32-NEXT:    ret i32 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV32-NEXT:    ret i32 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psrl_s_u16x2(
 // RV64-SAME: i64 [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
@@ -691,11 +706,12 @@ int16x2_t test_psra_s_i16x2(int16x2_t a, unsigned shamt) {
 // RV64-NEXT:    [[COERCE_VAL_II_I:%.*]] = trunc i64 [[A_COERCE]] to i32
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II_I]] to <2 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
-// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <2 x i16> [[SHR_I]] to i32
+// RV64-NEXT:    [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP4]] to i64
 // RV64-NEXT:    ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
 //
 uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
@@ -709,22 +725,24 @@ uint16x2_t test_psrl_s_u16x2(uint16x2_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_i8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
   return __riscv_psll_s_i8x8(a, shamt);
@@ -735,22 +753,24 @@ int8x8_t test_psll_s_i8x8(int8x8_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_u8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
   return __riscv_psll_s_u8x8(a, shamt);
@@ -761,22 +781,24 @@ uint8x8_t test_psll_s_u8x8(uint8x8_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_i16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
   return __riscv_psll_s_i16x4(a, shamt);
@@ -787,22 +809,24 @@ int16x4_t test_psll_s_i16x4(int16x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psll_s_u16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHL_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
   return __riscv_psll_s_u16x4(a, shamt);
@@ -812,7 +836,8 @@ uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
@@ -822,7 +847,8 @@ uint16x4_t test_psll_s_u16x4(uint16x4_t a, unsigned shamt) {
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
@@ -836,7 +862,8 @@ int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
@@ -846,7 +873,8 @@ int32x2_t test_psll_s_i32x2(int32x2_t a, unsigned shamt) {
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHL_I:%.*]] = shl <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHL_I]] to i64
@@ -861,22 +889,24 @@ uint32x2_t test_psll_s_u32x2(uint32x2_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psra_s_i8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
   return __riscv_psra_s_i8x8(a, shamt);
@@ -887,22 +917,24 @@ int8x8_t test_psra_s_i8x8(int8x8_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psrl_s_u8x8(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <8 x i8>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i8
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 7
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> poison, <8 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <8 x i8> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
   return __riscv_psrl_s_u8x8(a, shamt);
@@ -913,22 +945,24 @@ uint8x8_t test_psrl_s_u8x8(uint8x8_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psra_s_i16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
   return __riscv_psra_s_i16x4(a, shamt);
@@ -939,22 +973,24 @@ int16x4_t test_psra_s_i16x4(int16x4_t a, unsigned shamt) {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV32-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV32-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV32-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV32-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV32-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV32-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV32-NEXT:    ret i64 [[TMP3]]
+// RV32-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV32-NEXT:    ret i64 [[TMP4]]
 //
 // RV64-LABEL: define dso_local i64 @test_psrl_s_u16x4(
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <4 x i16>
 // RV64-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT]] to i16
-// RV64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i64 0
-// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> zeroinitializer
+// RV64-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
+// RV64-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i64 0
+// RV64-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <4 x i16> [[TMP0]], [[SH_PROM_I]]
-// RV64-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
-// RV64-NEXT:    ret i64 [[TMP3]]
+// RV64-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[SHR_I]] to i64
+// RV64-NEXT:    ret i64 [[TMP4]]
 //
 uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
   return __riscv_psrl_s_u16x4(a, shamt);
@@ -964,7 +1000,8 @@ uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
@@ -974,7 +1011,8 @@ uint16x4_t test_psrl_s_u16x4(uint16x4_t a, unsigned shamt) {
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = ashr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
@@ -988,7 +1026,8 @@ int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
 // RV32-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV32-NEXT:  [[ENTRY:.*:]]
 // RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV32-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV32-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV32-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV32-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV32-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64
@@ -998,7 +1037,8 @@ int32x2_t test_psra_s_i32x2(int32x2_t a, unsigned shamt) {
 // RV64-SAME: i64 noundef [[A_COERCE:%.*]], i32 noundef signext [[SHAMT:%.*]]) #[[ATTR0]] {
 // RV64-NEXT:  [[ENTRY:.*:]]
 // RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_COERCE]] to <2 x i32>
-// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[SHAMT]], i64 0
+// RV64-NEXT:    [[AND_I:%.*]] = and i32 [[SHAMT]], 31
+// RV64-NEXT:    [[SPLAT_SPLATINSERT_I:%.*]] = insertelement <2 x i32> poison, i32 [[AND_I]], i64 0
 // RV64-NEXT:    [[SPLAT_SPLAT_I:%.*]] = shufflevector <2 x i32> [[SPLAT_SPLATINSERT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // RV64-NEXT:    [[SHR_I:%.*]] = lshr <2 x i32> [[TMP0]], [[SPLAT_SPLAT_I]]
 // RV64-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHR_I]] to i64



More information about the cfe-commits mailing list