[clang] [llvm] [RISCV] Preliminary P-ext intrinsics support (PR #174068)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 30 23:42:20 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: SiHuaN (sihuan)
This patch adds initial intrinsic support for the RISC-V P extension, introducing padd and psub operations.
The implementation is based on the `Packed Addition and Subtraction` section of the P extension intrinsics specification.
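For context, here is a minimal usage sketch of the new builtins. The typedefs and builtin names are taken from the Clang test added in this patch, and it assumes the `experimental-p` target feature is enabled, as in the RUN lines below:

```c
#include <stdint.h>

typedef int8_t  v4i8  __attribute__((vector_size(4)));
typedef int16_t v2i16 __attribute__((vector_size(4)));

// Packed, element-wise add/sub on 32-bit packed vectors.
v4i8 add_bytes(v4i8 a, v4i8 b) {
  return __builtin_riscv_padd_v4i8(a, b);
}

v2i16 sub_halves(v2i16 a, v2i16 b) {
  return __builtin_riscv_psub_v2i16(a, b);
}
```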
---
Patch is 29.71 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/174068.diff
9 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsRISCV.td (+17)
- (modified) clang/lib/CodeGen/TargetBuiltins/RISCV.cpp (+28-3)
- (added) clang/test/CodeGen/RISCV/rvp-intrinsics.c (+223)
- (modified) llvm/include/llvm/IR/IntrinsicsRISCV.td (+12)
- (modified) llvm/lib/Target/RISCV/RISCVCallingConv.cpp (+46-29)
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+5)
- (modified) llvm/lib/Target/RISCV/RISCVInstrInfoP.td (+36)
- (modified) llvm/lib/Target/RISCV/RISCVRegisterInfo.td (+12-2)
- (added) llvm/test/CodeGen/RISCV/rvp-intrinsics.ll (+135)
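The new `llvm.riscv.padd`/`llvm.riscv.psub` intrinsics are overloaded on the vector type. As a rough reference model only: packed addition is assumed here to operate element-wise with wrap-around; that behaviour is not restated in this excerpt, so treat the sketch below as an illustration, not a specification.

```c
#include <stdint.h>

// Hypothetical reference model of padd on a <4 x i8> packed into a 32-bit word:
// element-wise, modulo-256 addition (wrap-around semantics are an assumption).
static inline uint32_t ref_padd_v4i8(uint32_t a, uint32_t b) {
  uint32_t r = 0;
  for (int i = 0; i < 4; ++i) {
    uint8_t ea = (a >> (8 * i)) & 0xff;
    uint8_t eb = (b >> (8 * i)) & 0xff;
    r |= (uint32_t)(uint8_t)(ea + eb) << (8 * i);
  }
  return r;
}
```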
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td
index 2dad5ede2d64b..1c43371cd52fc 100644
--- a/clang/include/clang/Basic/BuiltinsRISCV.td
+++ b/clang/include/clang/Basic/BuiltinsRISCV.td
@@ -137,6 +137,23 @@ def sm3p0 : RISCVBuiltin<"unsigned int(unsigned int)">;
def sm3p1 : RISCVBuiltin<"unsigned int(unsigned int)">;
} // Features = "zksh"
+//===----------------------------------------------------------------------===//
+// P extension.
+//===----------------------------------------------------------------------===//
+let Features = "experimental-p" in {
+def padd_v4i8 : RISCVBuiltin<"_Vector<4, char>(_Vector<4, char>, _Vector<4, char>)">;
+def padd_v2i16 : RISCVBuiltin<"_Vector<2, short>(_Vector<2, short>, _Vector<2, short>)">;
+def padd_v8i8 : RISCVBuiltin<"_Vector<8, char>(_Vector<8, char>, _Vector<8, char>)">;
+def padd_v4i16 : RISCVBuiltin<"_Vector<4, short>(_Vector<4, short>, _Vector<4, short>)">;
+def padd_v2i32 : RISCVBuiltin<"_Vector<2, int>(_Vector<2, int>, _Vector<2, int>)">;
+
+def psub_v4i8 : RISCVBuiltin<"_Vector<4, char>(_Vector<4, char>, _Vector<4, char>)">;
+def psub_v2i16 : RISCVBuiltin<"_Vector<2, short>(_Vector<2, short>, _Vector<2, short>)">;
+def psub_v8i8 : RISCVBuiltin<"_Vector<8, char>(_Vector<8, char>, _Vector<8, char>)">;
+def psub_v4i16 : RISCVBuiltin<"_Vector<4, short>(_Vector<4, short>, _Vector<4, short>)">;
+def psub_v2i32 : RISCVBuiltin<"_Vector<2, int>(_Vector<2, int>, _Vector<2, int>)">;
+} // Features = "experimental-p"
+
} // Attributes = [Const, NoThrow]
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
index 2e11037f0dcd0..8cc8b03db0137 100644
--- a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
@@ -1143,7 +1143,17 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
case RISCV::BI__builtin_riscv_brev8_32:
case RISCV::BI__builtin_riscv_brev8_64:
case RISCV::BI__builtin_riscv_zip_32:
- case RISCV::BI__builtin_riscv_unzip_32: {
+ case RISCV::BI__builtin_riscv_unzip_32:
+ case RISCV::BI__builtin_riscv_padd_v4i8:
+ case RISCV::BI__builtin_riscv_padd_v2i16:
+ case RISCV::BI__builtin_riscv_padd_v8i8:
+ case RISCV::BI__builtin_riscv_padd_v4i16:
+ case RISCV::BI__builtin_riscv_padd_v2i32:
+ case RISCV::BI__builtin_riscv_psub_v4i8:
+ case RISCV::BI__builtin_riscv_psub_v2i16:
+ case RISCV::BI__builtin_riscv_psub_v8i8:
+ case RISCV::BI__builtin_riscv_psub_v4i16:
+ case RISCV::BI__builtin_riscv_psub_v2i32: {
switch (BuiltinID) {
default: llvm_unreachable("unexpected builtin ID");
// Zbb
@@ -1187,11 +1197,26 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
case RISCV::BI__builtin_riscv_unzip_32:
ID = Intrinsic::riscv_unzip;
break;
- }
+ // P extension
+ case RISCV::BI__builtin_riscv_padd_v4i8:
+ case RISCV::BI__builtin_riscv_padd_v2i16:
+ case RISCV::BI__builtin_riscv_padd_v8i8:
+ case RISCV::BI__builtin_riscv_padd_v4i16:
+ case RISCV::BI__builtin_riscv_padd_v2i32:
+ ID = Intrinsic::riscv_padd;
+ break;
+ case RISCV::BI__builtin_riscv_psub_v4i8:
+ case RISCV::BI__builtin_riscv_psub_v2i16:
+ case RISCV::BI__builtin_riscv_psub_v8i8:
+ case RISCV::BI__builtin_riscv_psub_v4i16:
+ case RISCV::BI__builtin_riscv_psub_v2i32:
+ ID = Intrinsic::riscv_psub;
+ break;
+ }
+ }
IntrinsicTypes = {ResultType};
break;
- }
// Zk builtins
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
new file mode 100644
index 0000000000000..2d047f2438e8b
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -0,0 +1,223 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple riscv32 -target-feature +experimental-p -emit-llvm %s -O2 -o - | FileCheck %s --check-prefix=RV32
+// RUN: %clang_cc1 -triple riscv64 -target-feature +experimental-p -emit-llvm %s -O2 -o - | FileCheck %s --check-prefix=RV64
+
+#include <stdint.h>
+
+typedef int8_t v4i8 __attribute__((vector_size(4)));
+typedef int16_t v2i16 __attribute__((vector_size(4)));
+typedef int8_t v8i8 __attribute__((vector_size(8)));
+typedef int16_t v4i16 __attribute__((vector_size(8)));
+typedef int32_t v2i32 __attribute__((vector_size(8)));
+
+// RV32-LABEL: @test_padd_v4i8(
+// RV32-NEXT: entry:
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE:%.*]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE:%.*]] to <4 x i8>
+// RV32-NEXT: [[TMP2:%.*]] = tail call <4 x i8> @llvm.riscv.padd.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP3:%.*]] = bitcast <4 x i8> [[TMP2]] to i32
+// RV32-NEXT: ret i32 [[TMP3]]
+//
+// RV64-LABEL: @test_padd_v4i8(
+// RV64-NEXT: entry:
+// RV64-NEXT: [[COERCE_VAL_II:%.*]] = trunc i64 [[A_COERCE:%.*]] to i32
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II]] to <4 x i8>
+// RV64-NEXT: [[COERCE_VAL_II1:%.*]] = trunc i64 [[B_COERCE:%.*]] to i32
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1]] to <4 x i8>
+// RV64-NEXT: [[TMP2:%.*]] = tail call <4 x i8> @llvm.riscv.padd.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP3:%.*]] = bitcast <4 x i8> [[TMP2]] to i32
+// RV64-NEXT: [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT: ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+v4i8 test_padd_v4i8(v4i8 a, v4i8 b) {
+ return __builtin_riscv_padd_v4i8(a, b);
+}
+
+// RV32-LABEL: @test_padd_v2i16(
+// RV32-NEXT: entry:
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE:%.*]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE:%.*]] to <2 x i16>
+// RV32-NEXT: [[TMP2:%.*]] = tail call <2 x i16> @llvm.riscv.padd.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to i32
+// RV32-NEXT: ret i32 [[TMP3]]
+//
+// RV64-LABEL: @test_padd_v2i16(
+// RV64-NEXT: entry:
+// RV64-NEXT: [[COERCE_VAL_II:%.*]] = trunc i64 [[A_COERCE:%.*]] to i32
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II]] to <2 x i16>
+// RV64-NEXT: [[COERCE_VAL_II1:%.*]] = trunc i64 [[B_COERCE:%.*]] to i32
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1]] to <2 x i16>
+// RV64-NEXT: [[TMP2:%.*]] = tail call <2 x i16> @llvm.riscv.padd.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to i32
+// RV64-NEXT: [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT: ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+v2i16 test_padd_v2i16(v2i16 a, v2i16 b) {
+ return __builtin_riscv_padd_v2i16(a, b);
+}
+
+// RV32-LABEL: @test_padd_v8i8(
+// RV32-NEXT: entry:
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <8 x i8>
+// RV32-NEXT: [[TMP2:%.*]] = tail call <8 x i8> @llvm.riscv.padd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+// RV32-NEXT: ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_padd_v8i8(
+// RV64-NEXT: entry:
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <8 x i8>
+// RV64-NEXT: [[TMP2:%.*]] = tail call <8 x i8> @llvm.riscv.padd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+// RV64-NEXT: ret i64 [[TMP3]]
+//
+v8i8 test_padd_v8i8(v8i8 a, v8i8 b) {
+ return __builtin_riscv_padd_v8i8(a, b);
+}
+
+// RV32-LABEL: @test_padd_v4i16(
+// RV32-NEXT: entry:
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <4 x i16>
+// RV32-NEXT: [[TMP2:%.*]] = tail call <4 x i16> @llvm.riscv.padd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
+// RV32-NEXT: ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_padd_v4i16(
+// RV64-NEXT: entry:
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <4 x i16>
+// RV64-NEXT: [[TMP2:%.*]] = tail call <4 x i16> @llvm.riscv.padd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
+// RV64-NEXT: ret i64 [[TMP3]]
+//
+v4i16 test_padd_v4i16(v4i16 a, v4i16 b) {
+ return __builtin_riscv_padd_v4i16(a, b);
+}
+
+// RV32-LABEL: @test_padd_v2i32(
+// RV32-NEXT: entry:
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <2 x i32>
+// RV32-NEXT: [[TMP2:%.*]] = tail call <2 x i32> @llvm.riscv.padd.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
+// RV32-NEXT: ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_padd_v2i32(
+// RV64-NEXT: entry:
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <2 x i32>
+// RV64-NEXT: [[TMP2:%.*]] = tail call <2 x i32> @llvm.riscv.padd.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
+// RV64-NEXT: ret i64 [[TMP3]]
+//
+v2i32 test_padd_v2i32(v2i32 a, v2i32 b) {
+ return __builtin_riscv_padd_v2i32(a, b);
+}
+
+// RV32-LABEL: @test_psub_v4i8(
+// RV32-NEXT: entry:
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE:%.*]] to <4 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE:%.*]] to <4 x i8>
+// RV32-NEXT: [[TMP2:%.*]] = tail call <4 x i8> @llvm.riscv.psub.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP3:%.*]] = bitcast <4 x i8> [[TMP2]] to i32
+// RV32-NEXT: ret i32 [[TMP3]]
+//
+// RV64-LABEL: @test_psub_v4i8(
+// RV64-NEXT: entry:
+// RV64-NEXT: [[COERCE_VAL_II:%.*]] = trunc i64 [[A_COERCE:%.*]] to i32
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II]] to <4 x i8>
+// RV64-NEXT: [[COERCE_VAL_II1:%.*]] = trunc i64 [[B_COERCE:%.*]] to i32
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1]] to <4 x i8>
+// RV64-NEXT: [[TMP2:%.*]] = tail call <4 x i8> @llvm.riscv.psub.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP3:%.*]] = bitcast <4 x i8> [[TMP2]] to i32
+// RV64-NEXT: [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT: ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+v4i8 test_psub_v4i8(v4i8 a, v4i8 b) {
+ return __builtin_riscv_psub_v4i8(a, b);
+}
+
+// RV32-LABEL: @test_psub_v2i16(
+// RV32-NEXT: entry:
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_COERCE:%.*]] to <2 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[B_COERCE:%.*]] to <2 x i16>
+// RV32-NEXT: [[TMP2:%.*]] = tail call <2 x i16> @llvm.riscv.psub.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to i32
+// RV32-NEXT: ret i32 [[TMP3]]
+//
+// RV64-LABEL: @test_psub_v2i16(
+// RV64-NEXT: entry:
+// RV64-NEXT: [[COERCE_VAL_II:%.*]] = trunc i64 [[A_COERCE:%.*]] to i32
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[COERCE_VAL_II]] to <2 x i16>
+// RV64-NEXT: [[COERCE_VAL_II1:%.*]] = trunc i64 [[B_COERCE:%.*]] to i32
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[COERCE_VAL_II1]] to <2 x i16>
+// RV64-NEXT: [[TMP2:%.*]] = tail call <2 x i16> @llvm.riscv.psub.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to i32
+// RV64-NEXT: [[RETVAL_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[TMP3]] to i64
+// RV64-NEXT: ret i64 [[RETVAL_COERCE_0_INSERT_EXT]]
+//
+v2i16 test_psub_v2i16(v2i16 a, v2i16 b) {
+ return __builtin_riscv_psub_v2i16(a, b);
+}
+
+// RV32-LABEL: @test_psub_v8i8(
+// RV32-NEXT: entry:
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <8 x i8>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <8 x i8>
+// RV32-NEXT: [[TMP2:%.*]] = tail call <8 x i8> @llvm.riscv.psub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV32-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+// RV32-NEXT: ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_psub_v8i8(
+// RV64-NEXT: entry:
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <8 x i8>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <8 x i8>
+// RV64-NEXT: [[TMP2:%.*]] = tail call <8 x i8> @llvm.riscv.psub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// RV64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to i64
+// RV64-NEXT: ret i64 [[TMP3]]
+//
+v8i8 test_psub_v8i8(v8i8 a, v8i8 b) {
+ return __builtin_riscv_psub_v8i8(a, b);
+}
+
+// RV32-LABEL: @test_psub_v4i16(
+// RV32-NEXT: entry:
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <4 x i16>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <4 x i16>
+// RV32-NEXT: [[TMP2:%.*]] = tail call <4 x i16> @llvm.riscv.psub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV32-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
+// RV32-NEXT: ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_psub_v4i16(
+// RV64-NEXT: entry:
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <4 x i16>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <4 x i16>
+// RV64-NEXT: [[TMP2:%.*]] = tail call <4 x i16> @llvm.riscv.psub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// RV64-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
+// RV64-NEXT: ret i64 [[TMP3]]
+//
+v4i16 test_psub_v4i16(v4i16 a, v4i16 b) {
+ return __builtin_riscv_psub_v4i16(a, b);
+}
+
+// RV32-LABEL: @test_psub_v2i32(
+// RV32-NEXT: entry:
+// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <2 x i32>
+// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <2 x i32>
+// RV32-NEXT: [[TMP2:%.*]] = tail call <2 x i32> @llvm.riscv.psub.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
+// RV32-NEXT: ret i64 [[TMP3]]
+//
+// RV64-LABEL: @test_psub_v2i32(
+// RV64-NEXT: entry:
+// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_COERCE:%.*]] to <2 x i32>
+// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE:%.*]] to <2 x i32>
+// RV64-NEXT: [[TMP2:%.*]] = tail call <2 x i32> @llvm.riscv.psub.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]])
+// RV64-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
+// RV64-NEXT: ret i64 [[TMP3]]
+//
+v2i32 test_psub_v2i32(v2i32 a, v2i32 b) {
+ return __builtin_riscv_psub_v2i32(a, b);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index 9088e5e6a357b..c35e09e372e89 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1978,6 +1978,18 @@ let TargetPrefix = "riscv" in {
defm vfncvt_sat_f_f_q_alt : RISCVConversionRoundingMode;
} // TargetPrefix = "riscv"
+//===----------------------------------------------------------------------===//
+// Packed SIMD (P) Extension
+
+let TargetPrefix = "riscv" in {
+ def int_riscv_padd : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+ def int_riscv_psub : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+}
+
// Vendor extensions
//===----------------------------------------------------------------------===//
include "llvm/IR/IntrinsicsRISCVXTHead.td"
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
index 78f47794a5b66..c8688d8aefaf3 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
@@ -545,37 +545,53 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
unsigned StoreSizeBytes = XLen / 8;
Align StackAlign = Align(XLen / 8);
+ static const MCPhysReg ArgGPRPairs[] = {RISCV::X10_X11, RISCV::X12_X13,
+ RISCV::X14_X15, RISCV::X16_X17};
+
if (ValVT.isVector() || ValVT.isRISCVVectorTuple()) {
- Reg = allocateRVVReg(ValVT, ValNo, State, TLI);
- if (Reg) {
- // Fixed-length vectors are located in the corresponding scalable-vector
- // container types.
- if (ValVT.isFixedLengthVector()) {
- LocVT = TLI.getContainerForFixedLengthVector(LocVT);
- State.addLoc(
- CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false;
- }
- } else {
- // For return values, the vector must be passed fully via registers or
- // via the stack.
- // FIXME: The proposed vector ABI only mandates v8-v15 for return values,
- // but we're using all of them.
- if (IsRet)
- return true;
- // Try using a GPR to pass the address
- if ((Reg = State.AllocateReg(ArgGPRs))) {
- LocVT = XLenVT;
- LocInfo = CCValAssign::Indirect;
- } else if (ValVT.isScalableVector()) {
- LocVT = XLenVT;
- LocInfo = CCValAssign::Indirect;
+ bool IsPVectorInGPR = false;
+ if (Subtarget.enablePExtSIMDCodeGen() && ValVT.isVector()) {
+ const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT);
+ if (RC == &RISCV::GPRRegClass || RC == &RISCV::GPRPairRegClass)
+ IsPVectorInGPR = true;
+ }
+
+ if (!IsPVectorInGPR) {
+ Reg = allocateRVVReg(ValVT, ValNo, State, TLI);
+ if (Reg) {
+ // Fixed-length vectors are located in the corresponding scalable-vector
+ // container types.
+ if (ValVT.isFixedLengthVector()) {
+ LocVT = TLI.getContainerForFixedLengthVector(LocVT);
+ State.addLoc(
+ CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
} else {
- StoreSizeBytes = ValVT.getStoreSize();
- // Align vectors to their element sizes, being careful for vXi1
- // vectors.
- StackAlign = MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne();
+ // For return values, the vector must be passed fully via registers or
+ // via the stack.
+ // FIXME: The proposed vector ABI only mandates v8-v15 for return
+ // values, but we're using all of them.
+ if (IsRet)
+ return true;
+ // Try using a GPR to pass the address
+ if ((Reg = State.AllocateReg(ArgGPRs))) {
+ LocVT = XLenVT;
+ LocInfo = CCValAssign::Indirect;
+ } else if (ValVT.isScalableVector()) {
+ LocVT = XLenVT;
+ LocInfo = CCValAssign::Indirect;
+ } else {
+ StoreSizeBytes = ValVT.getStoreSize();
+ // Align vectors to their element sizes, being careful for vXi1
+ // vectors.
+ StackAlign = MaybeAlign(ValVT.getScalarSizeInBits() / 8).valueOrOne();
+ }
}
+ } else if (XLen == 32 && ValVT.getSizeInBits() == 64) {
+ Reg = State.AllocateReg(ArgGPRPairs);
+ } else {
+ Reg = State.AllocateReg(ArgGPRs);
}
} else {
Reg = State.AllocateReg(ArgGPRs);
@@ -604,7 +620,8 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
assert(((ValVT.isFloatingPoint() && !ValVT.isVector()) || LocVT == XLenVT ||
(TLI.getSubtarget().hasVInstructions() &&
- (ValVT.isVector() || ValVT.isRISCVVectorTuple()))) &&
+ (ValVT.isVector() || ValVT.isRISCVVectorTuple())) ||
+ (Subtarget.enablePExtSIMDCodeGen() && ValVT.isVector())) &&
"Expected an XLenVT or vector types at this stage");
if (Reg) {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c60f740d37576..d084c1cfdc8b4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -293,9 +293,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/174068