[llvm] [RISCV][llvm] Support div/rem codegen for P extension (PR #174801)

Brandon Wu via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 8 00:02:09 PST 2026


https://github.com/4vtomat updated https://github.com/llvm/llvm-project/pull/174801

>From d4c8819005b1cc672a4c8f6bc3eb4fbee3879053 Mon Sep 17 00:00:00 2001
From: Brandon Wu <brandon.wu at sifive.com>
Date: Wed, 7 Jan 2026 08:37:42 -0800
Subject: [PATCH] [RISCV][llvm] Support div/rem codegen for P extension

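The P extension has no native packed div/rem instructions, so mark
SDIV/UDIV/SREM/UREM and SDIVREM/UDIVREM as Expand for its vector types;
they are then scalarized into per-element scalar div/rem operations.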
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |   3 +
 llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll     | 401 +++++++++++++++++-
 llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll     | 435 +++++++++++++++++++-
 3 files changed, 836 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d6b62736bdf60..a1dd083561318 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -546,6 +546,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     setOperationAction({ISD::SHL, ISD::SRL, ISD::SRA}, VTs, Custom);
     setOperationAction(ISD::BITCAST, VTs, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VTs, Custom);
+    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
+                        ISD::SDIVREM, ISD::UDIVREM},
+                       VTs, Expand);
   }
 
   if (Subtarget.hasStdExtZfbfmin()) {
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
index 2836cda16b6d9..837a24f164ca9 100644
--- a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -riscv-enable-p-ext-simd-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV32 %s
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -riscv-enable-p-ext-simd-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV64 %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p,+m -riscv-enable-p-ext-simd-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p,+m -riscv-enable-p-ext-simd-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV64 %s
 
 ; Test basic add/sub operations for v2i16
 define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
@@ -1118,3 +1118,400 @@ define void @test_pmulhsu_h_commuted(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
   store <2 x i16> %res, ptr %ret_ptr
   ret void
 }
+
+; Division and remainder tests
+define void @test_psdiv_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_psdiv_h:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a2)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    srai a3, a2, 16
+; CHECK-RV32-NEXT:    srai a4, a1, 16
+; CHECK-RV32-NEXT:    slli a2, a2, 16
+; CHECK-RV32-NEXT:    slli a1, a1, 16
+; CHECK-RV32-NEXT:    div a3, a4, a3
+; CHECK-RV32-NEXT:    srai a2, a2, 16
+; CHECK-RV32-NEXT:    srai a1, a1, 16
+; CHECK-RV32-NEXT:    div a1, a1, a2
+; CHECK-RV32-NEXT:    pack a1, a1, a3
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_psdiv_h:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 0(a2)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    sraiw a3, a2, 16
+; CHECK-RV64-NEXT:    sraiw a4, a1, 16
+; CHECK-RV64-NEXT:    slli a2, a2, 48
+; CHECK-RV64-NEXT:    slli a1, a1, 48
+; CHECK-RV64-NEXT:    divw a3, a4, a3
+; CHECK-RV64-NEXT:    srai a2, a2, 48
+; CHECK-RV64-NEXT:    srai a1, a1, 48
+; CHECK-RV64-NEXT:    divw a1, a1, a2
+; CHECK-RV64-NEXT:    ppaire.h a1, a1, a3
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = sdiv <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psdiv_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_psdiv_b:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a2)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    srai a3, a2, 24
+; CHECK-RV32-NEXT:    srai a4, a1, 24
+; CHECK-RV32-NEXT:    slli a5, a2, 16
+; CHECK-RV32-NEXT:    slli a6, a1, 16
+; CHECK-RV32-NEXT:    div a7, a4, a3
+; CHECK-RV32-NEXT:    srai a5, a5, 24
+; CHECK-RV32-NEXT:    srai a3, a6, 24
+; CHECK-RV32-NEXT:    div a6, a3, a5
+; CHECK-RV32-NEXT:    slli a3, a2, 8
+; CHECK-RV32-NEXT:    slli a4, a1, 8
+; CHECK-RV32-NEXT:    slli a2, a2, 24
+; CHECK-RV32-NEXT:    slli a1, a1, 24
+; CHECK-RV32-NEXT:    srai a3, a3, 24
+; CHECK-RV32-NEXT:    srai a4, a4, 24
+; CHECK-RV32-NEXT:    srai a2, a2, 24
+; CHECK-RV32-NEXT:    div a3, a4, a3
+; CHECK-RV32-NEXT:    srai a1, a1, 24
+; CHECK-RV32-NEXT:    div a2, a1, a2
+; CHECK-RV32-NEXT:    ppaire.db a2, a2, a6
+; CHECK-RV32-NEXT:    pack a1, a2, a3
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_psdiv_b:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 0(a2)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    sraiw a3, a2, 24
+; CHECK-RV64-NEXT:    sraiw a4, a1, 24
+; CHECK-RV64-NEXT:    slli a5, a2, 40
+; CHECK-RV64-NEXT:    slli a6, a1, 40
+; CHECK-RV64-NEXT:    divw a3, a4, a3
+; CHECK-RV64-NEXT:    slli a4, a2, 48
+; CHECK-RV64-NEXT:    srai a5, a5, 56
+; CHECK-RV64-NEXT:    srai a6, a6, 56
+; CHECK-RV64-NEXT:    divw a5, a6, a5
+; CHECK-RV64-NEXT:    slli a6, a1, 48
+; CHECK-RV64-NEXT:    srai a4, a4, 56
+; CHECK-RV64-NEXT:    srai a6, a6, 56
+; CHECK-RV64-NEXT:    divw a4, a6, a4
+; CHECK-RV64-NEXT:    slli a2, a2, 56
+; CHECK-RV64-NEXT:    slli a1, a1, 56
+; CHECK-RV64-NEXT:    srai a2, a2, 56
+; CHECK-RV64-NEXT:    srai a1, a1, 56
+; CHECK-RV64-NEXT:    divw a1, a1, a2
+; CHECK-RV64-NEXT:    ppaire.b a2, a5, a3
+; CHECK-RV64-NEXT:    ppaire.b a1, a1, a4
+; CHECK-RV64-NEXT:    ppaire.h a1, a1, a2
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = sdiv <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pudiv_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_pudiv_h:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a2)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    lui a3, 16
+; CHECK-RV32-NEXT:    addi a3, a3, -1
+; CHECK-RV32-NEXT:    and a4, a2, a3
+; CHECK-RV32-NEXT:    and a3, a1, a3
+; CHECK-RV32-NEXT:    srli a2, a2, 16
+; CHECK-RV32-NEXT:    srli a1, a1, 16
+; CHECK-RV32-NEXT:    divu a3, a3, a4
+; CHECK-RV32-NEXT:    divu a1, a1, a2
+; CHECK-RV32-NEXT:    pack a1, a3, a1
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_pudiv_h:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 0(a2)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    lui a3, 16
+; CHECK-RV64-NEXT:    addi a3, a3, -1
+; CHECK-RV64-NEXT:    and a4, a2, a3
+; CHECK-RV64-NEXT:    and a3, a1, a3
+; CHECK-RV64-NEXT:    srliw a2, a2, 16
+; CHECK-RV64-NEXT:    srliw a1, a1, 16
+; CHECK-RV64-NEXT:    divuw a3, a3, a4
+; CHECK-RV64-NEXT:    divuw a1, a1, a2
+; CHECK-RV64-NEXT:    ppaire.h a1, a3, a1
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = udiv <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pudiv_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_pudiv_b:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a2)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    srli a3, a2, 24
+; CHECK-RV32-NEXT:    srli a4, a1, 24
+; CHECK-RV32-NEXT:    slli a5, a2, 16
+; CHECK-RV32-NEXT:    slli a6, a1, 16
+; CHECK-RV32-NEXT:    divu a7, a4, a3
+; CHECK-RV32-NEXT:    srli a5, a5, 24
+; CHECK-RV32-NEXT:    srli a3, a6, 24
+; CHECK-RV32-NEXT:    divu a6, a3, a5
+; CHECK-RV32-NEXT:    zext.b a3, a2
+; CHECK-RV32-NEXT:    zext.b a4, a1
+; CHECK-RV32-NEXT:    slli a2, a2, 8
+; CHECK-RV32-NEXT:    slli a1, a1, 8
+; CHECK-RV32-NEXT:    divu a4, a4, a3
+; CHECK-RV32-NEXT:    srli a2, a2, 24
+; CHECK-RV32-NEXT:    srli a1, a1, 24
+; CHECK-RV32-NEXT:    divu a5, a1, a2
+; CHECK-RV32-NEXT:    ppaire.db a2, a4, a6
+; CHECK-RV32-NEXT:    pack a1, a2, a3
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_pudiv_b:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 0(a2)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    srliw a3, a2, 24
+; CHECK-RV64-NEXT:    srliw a4, a1, 24
+; CHECK-RV64-NEXT:    slli a5, a2, 40
+; CHECK-RV64-NEXT:    zext.b a6, a2
+; CHECK-RV64-NEXT:    divuw a3, a4, a3
+; CHECK-RV64-NEXT:    zext.b a4, a1
+; CHECK-RV64-NEXT:    divuw a4, a4, a6
+; CHECK-RV64-NEXT:    slli a6, a1, 40
+; CHECK-RV64-NEXT:    srli a5, a5, 56
+; CHECK-RV64-NEXT:    srli a6, a6, 56
+; CHECK-RV64-NEXT:    divuw a5, a6, a5
+; CHECK-RV64-NEXT:    slli a2, a2, 48
+; CHECK-RV64-NEXT:    slli a1, a1, 48
+; CHECK-RV64-NEXT:    srli a2, a2, 56
+; CHECK-RV64-NEXT:    srli a1, a1, 56
+; CHECK-RV64-NEXT:    divuw a1, a1, a2
+; CHECK-RV64-NEXT:    ppaire.b a2, a5, a3
+; CHECK-RV64-NEXT:    ppaire.b a1, a4, a1
+; CHECK-RV64-NEXT:    ppaire.h a1, a1, a2
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = udiv <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psrem_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_psrem_h:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a2)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    srai a3, a2, 16
+; CHECK-RV32-NEXT:    srai a4, a1, 16
+; CHECK-RV32-NEXT:    slli a2, a2, 16
+; CHECK-RV32-NEXT:    slli a1, a1, 16
+; CHECK-RV32-NEXT:    rem a3, a4, a3
+; CHECK-RV32-NEXT:    srai a2, a2, 16
+; CHECK-RV32-NEXT:    srai a1, a1, 16
+; CHECK-RV32-NEXT:    rem a1, a1, a2
+; CHECK-RV32-NEXT:    pack a1, a1, a3
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_psrem_h:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 0(a2)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    sraiw a3, a2, 16
+; CHECK-RV64-NEXT:    sraiw a4, a1, 16
+; CHECK-RV64-NEXT:    slli a2, a2, 48
+; CHECK-RV64-NEXT:    slli a1, a1, 48
+; CHECK-RV64-NEXT:    remw a3, a4, a3
+; CHECK-RV64-NEXT:    srai a2, a2, 48
+; CHECK-RV64-NEXT:    srai a1, a1, 48
+; CHECK-RV64-NEXT:    remw a1, a1, a2
+; CHECK-RV64-NEXT:    ppaire.h a1, a1, a3
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = srem <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psrem_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_psrem_b:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a2)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    srai a3, a2, 24
+; CHECK-RV32-NEXT:    srai a4, a1, 24
+; CHECK-RV32-NEXT:    slli a5, a2, 16
+; CHECK-RV32-NEXT:    slli a6, a1, 16
+; CHECK-RV32-NEXT:    rem a7, a4, a3
+; CHECK-RV32-NEXT:    srai a5, a5, 24
+; CHECK-RV32-NEXT:    srai a3, a6, 24
+; CHECK-RV32-NEXT:    rem a6, a3, a5
+; CHECK-RV32-NEXT:    slli a3, a2, 8
+; CHECK-RV32-NEXT:    slli a4, a1, 8
+; CHECK-RV32-NEXT:    slli a2, a2, 24
+; CHECK-RV32-NEXT:    slli a1, a1, 24
+; CHECK-RV32-NEXT:    srai a3, a3, 24
+; CHECK-RV32-NEXT:    srai a4, a4, 24
+; CHECK-RV32-NEXT:    srai a2, a2, 24
+; CHECK-RV32-NEXT:    rem a3, a4, a3
+; CHECK-RV32-NEXT:    srai a1, a1, 24
+; CHECK-RV32-NEXT:    rem a2, a1, a2
+; CHECK-RV32-NEXT:    ppaire.db a2, a2, a6
+; CHECK-RV32-NEXT:    pack a1, a2, a3
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_psrem_b:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 0(a2)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    sraiw a3, a2, 24
+; CHECK-RV64-NEXT:    sraiw a4, a1, 24
+; CHECK-RV64-NEXT:    slli a5, a2, 40
+; CHECK-RV64-NEXT:    slli a6, a1, 40
+; CHECK-RV64-NEXT:    remw a3, a4, a3
+; CHECK-RV64-NEXT:    slli a4, a2, 48
+; CHECK-RV64-NEXT:    srai a5, a5, 56
+; CHECK-RV64-NEXT:    srai a6, a6, 56
+; CHECK-RV64-NEXT:    remw a5, a6, a5
+; CHECK-RV64-NEXT:    slli a6, a1, 48
+; CHECK-RV64-NEXT:    srai a4, a4, 56
+; CHECK-RV64-NEXT:    srai a6, a6, 56
+; CHECK-RV64-NEXT:    remw a4, a6, a4
+; CHECK-RV64-NEXT:    slli a2, a2, 56
+; CHECK-RV64-NEXT:    slli a1, a1, 56
+; CHECK-RV64-NEXT:    srai a2, a2, 56
+; CHECK-RV64-NEXT:    srai a1, a1, 56
+; CHECK-RV64-NEXT:    remw a1, a1, a2
+; CHECK-RV64-NEXT:    ppaire.b a2, a5, a3
+; CHECK-RV64-NEXT:    ppaire.b a1, a1, a4
+; CHECK-RV64-NEXT:    ppaire.h a1, a1, a2
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = srem <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_purem_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_purem_h:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a2)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    lui a3, 16
+; CHECK-RV32-NEXT:    addi a3, a3, -1
+; CHECK-RV32-NEXT:    and a4, a2, a3
+; CHECK-RV32-NEXT:    and a3, a1, a3
+; CHECK-RV32-NEXT:    srli a2, a2, 16
+; CHECK-RV32-NEXT:    srli a1, a1, 16
+; CHECK-RV32-NEXT:    remu a3, a3, a4
+; CHECK-RV32-NEXT:    remu a1, a1, a2
+; CHECK-RV32-NEXT:    pack a1, a3, a1
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_purem_h:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 0(a2)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    lui a3, 16
+; CHECK-RV64-NEXT:    addi a3, a3, -1
+; CHECK-RV64-NEXT:    and a4, a2, a3
+; CHECK-RV64-NEXT:    and a3, a1, a3
+; CHECK-RV64-NEXT:    srliw a2, a2, 16
+; CHECK-RV64-NEXT:    srliw a1, a1, 16
+; CHECK-RV64-NEXT:    remuw a3, a3, a4
+; CHECK-RV64-NEXT:    remuw a1, a1, a2
+; CHECK-RV64-NEXT:    ppaire.h a1, a3, a1
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = urem <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_purem_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-RV32-LABEL: test_purem_b:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    lw a2, 0(a2)
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    srli a3, a2, 24
+; CHECK-RV32-NEXT:    srli a4, a1, 24
+; CHECK-RV32-NEXT:    slli a5, a2, 16
+; CHECK-RV32-NEXT:    slli a6, a1, 16
+; CHECK-RV32-NEXT:    remu a7, a4, a3
+; CHECK-RV32-NEXT:    srli a5, a5, 24
+; CHECK-RV32-NEXT:    srli a3, a6, 24
+; CHECK-RV32-NEXT:    remu a6, a3, a5
+; CHECK-RV32-NEXT:    zext.b a3, a2
+; CHECK-RV32-NEXT:    zext.b a4, a1
+; CHECK-RV32-NEXT:    slli a2, a2, 8
+; CHECK-RV32-NEXT:    slli a1, a1, 8
+; CHECK-RV32-NEXT:    remu a4, a4, a3
+; CHECK-RV32-NEXT:    srli a2, a2, 24
+; CHECK-RV32-NEXT:    srli a1, a1, 24
+; CHECK-RV32-NEXT:    remu a5, a1, a2
+; CHECK-RV32-NEXT:    ppaire.db a2, a4, a6
+; CHECK-RV32-NEXT:    pack a1, a2, a3
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_purem_b:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    lw a2, 0(a2)
+; CHECK-RV64-NEXT:    lw a1, 0(a1)
+; CHECK-RV64-NEXT:    srliw a3, a2, 24
+; CHECK-RV64-NEXT:    srliw a4, a1, 24
+; CHECK-RV64-NEXT:    slli a5, a2, 40
+; CHECK-RV64-NEXT:    zext.b a6, a2
+; CHECK-RV64-NEXT:    remuw a3, a4, a3
+; CHECK-RV64-NEXT:    zext.b a4, a1
+; CHECK-RV64-NEXT:    remuw a4, a4, a6
+; CHECK-RV64-NEXT:    slli a6, a1, 40
+; CHECK-RV64-NEXT:    srli a5, a5, 56
+; CHECK-RV64-NEXT:    srli a6, a6, 56
+; CHECK-RV64-NEXT:    remuw a5, a6, a5
+; CHECK-RV64-NEXT:    slli a2, a2, 48
+; CHECK-RV64-NEXT:    slli a1, a1, 48
+; CHECK-RV64-NEXT:    srli a2, a2, 56
+; CHECK-RV64-NEXT:    srli a1, a1, 56
+; CHECK-RV64-NEXT:    remuw a1, a1, a2
+; CHECK-RV64-NEXT:    ppaire.b a2, a5, a3
+; CHECK-RV64-NEXT:    ppaire.b a1, a4, a1
+; CHECK-RV64-NEXT:    ppaire.h a1, a1, a2
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = urem <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
index dfa1b242e656f..702caba23ca96 100644
--- a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -riscv-enable-p-ext-simd-codegen -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p,+m -riscv-enable-p-ext-simd-codegen -verify-machineinstrs < %s | FileCheck %s
 
 ; Test basic add/sub operations for v4i16
 define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
@@ -1148,3 +1148,436 @@ define void @test_pmulhsu_w_commuted(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
   store <2 x i32> %res, ptr %ret_ptr
   ret void
 }
+
+; Division and remainder tests
+define void @test_psdiv_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psdiv_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    srai a3, a2, 48
+; CHECK-NEXT:    srai a4, a1, 48
+; CHECK-NEXT:    slli a5, a2, 16
+; CHECK-NEXT:    sraiw a6, a2, 16
+; CHECK-NEXT:    divw a3, a4, a3
+; CHECK-NEXT:    sraiw a4, a1, 16
+; CHECK-NEXT:    divw a4, a4, a6
+; CHECK-NEXT:    slli a6, a1, 16
+; CHECK-NEXT:    srai a5, a5, 48
+; CHECK-NEXT:    srai a6, a6, 48
+; CHECK-NEXT:    divw a5, a6, a5
+; CHECK-NEXT:    slli a2, a2, 48
+; CHECK-NEXT:    slli a1, a1, 48
+; CHECK-NEXT:    srai a2, a2, 48
+; CHECK-NEXT:    srai a1, a1, 48
+; CHECK-NEXT:    divw a1, a1, a2
+; CHECK-NEXT:    ppaire.h a2, a5, a3
+; CHECK-NEXT:    ppaire.h a1, a1, a4
+; CHECK-NEXT:    pack a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %res = sdiv <4 x i16> %a, %b
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psdiv_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psdiv_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    srai a3, a2, 56
+; CHECK-NEXT:    srai a4, a1, 56
+; CHECK-NEXT:    slli a5, a2, 8
+; CHECK-NEXT:    slli a6, a1, 8
+; CHECK-NEXT:    slli a7, a2, 16
+; CHECK-NEXT:    slli t0, a1, 16
+; CHECK-NEXT:    slli t1, a2, 24
+; CHECK-NEXT:    sraiw t2, a2, 24
+; CHECK-NEXT:    divw a3, a4, a3
+; CHECK-NEXT:    sraiw a4, a1, 24
+; CHECK-NEXT:    divw a4, a4, t2
+; CHECK-NEXT:    slli t2, a1, 24
+; CHECK-NEXT:    srai a5, a5, 56
+; CHECK-NEXT:    srai a6, a6, 56
+; CHECK-NEXT:    divw a5, a6, a5
+; CHECK-NEXT:    slli a6, a2, 40
+; CHECK-NEXT:    srai a7, a7, 56
+; CHECK-NEXT:    srai t0, t0, 56
+; CHECK-NEXT:    divw a7, t0, a7
+; CHECK-NEXT:    slli t0, a1, 40
+; CHECK-NEXT:    srai t1, t1, 56
+; CHECK-NEXT:    srai t2, t2, 56
+; CHECK-NEXT:    divw t1, t2, t1
+; CHECK-NEXT:    slli t2, a2, 48
+; CHECK-NEXT:    srai a6, a6, 56
+; CHECK-NEXT:    srai t0, t0, 56
+; CHECK-NEXT:    divw a6, t0, a6
+; CHECK-NEXT:    slli t0, a1, 48
+; CHECK-NEXT:    srai t2, t2, 56
+; CHECK-NEXT:    srai t0, t0, 56
+; CHECK-NEXT:    divw t0, t0, t2
+; CHECK-NEXT:    slli a2, a2, 56
+; CHECK-NEXT:    slli a1, a1, 56
+; CHECK-NEXT:    srai a2, a2, 56
+; CHECK-NEXT:    srai a1, a1, 56
+; CHECK-NEXT:    divw a1, a1, a2
+; CHECK-NEXT:    ppaire.b a2, a5, a3
+; CHECK-NEXT:    ppaire.b a3, t1, a7
+; CHECK-NEXT:    ppaire.b a4, a6, a4
+; CHECK-NEXT:    ppaire.b a1, a1, t0
+; CHECK-NEXT:    ppaire.h a2, a3, a2
+; CHECK-NEXT:    ppaire.h a1, a1, a4
+; CHECK-NEXT:    pack a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %res = sdiv <8 x i8> %a, %b
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psdiv_w(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psdiv_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    divw a3, a1, a2
+; CHECK-NEXT:    srli a2, a2, 32
+; CHECK-NEXT:    srli a1, a1, 32
+; CHECK-NEXT:    divw a1, a1, a2
+; CHECK-NEXT:    pack a1, a3, a1
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i32>, ptr %a_ptr
+  %b = load <2 x i32>, ptr %b_ptr
+  %res = sdiv <2 x i32> %a, %b
+  store <2 x i32> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pudiv_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pudiv_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    lui a3, 16
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a4, a2, a3
+; CHECK-NEXT:    and a3, a1, a3
+; CHECK-NEXT:    srliw a5, a2, 16
+; CHECK-NEXT:    srliw a6, a1, 16
+; CHECK-NEXT:    divuw a3, a3, a4
+; CHECK-NEXT:    srli a4, a2, 48
+; CHECK-NEXT:    divuw a5, a6, a5
+; CHECK-NEXT:    srli a6, a1, 48
+; CHECK-NEXT:    divuw a4, a6, a4
+; CHECK-NEXT:    slli a2, a2, 16
+; CHECK-NEXT:    slli a1, a1, 16
+; CHECK-NEXT:    srli a2, a2, 48
+; CHECK-NEXT:    srli a1, a1, 48
+; CHECK-NEXT:    divuw a1, a1, a2
+; CHECK-NEXT:    ppaire.h a2, a3, a5
+; CHECK-NEXT:    ppaire.h a1, a1, a4
+; CHECK-NEXT:    pack a1, a2, a1
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %res = udiv <4 x i16> %a, %b
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pudiv_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pudiv_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    srli a3, a2, 56
+; CHECK-NEXT:    srli a4, a1, 56
+; CHECK-NEXT:    slli a5, a2, 8
+; CHECK-NEXT:    slli a6, a1, 8
+; CHECK-NEXT:    slli a7, a2, 16
+; CHECK-NEXT:    slli t0, a1, 16
+; CHECK-NEXT:    srliw t1, a2, 24
+; CHECK-NEXT:    srliw t2, a1, 24
+; CHECK-NEXT:    divuw a3, a4, a3
+; CHECK-NEXT:    zext.b a4, a2
+; CHECK-NEXT:    divuw t1, t2, t1
+; CHECK-NEXT:    zext.b t2, a1
+; CHECK-NEXT:    divuw a4, t2, a4
+; CHECK-NEXT:    slli t2, a2, 24
+; CHECK-NEXT:    srli a5, a5, 56
+; CHECK-NEXT:    srli a6, a6, 56
+; CHECK-NEXT:    divuw a5, a6, a5
+; CHECK-NEXT:    slli a6, a1, 24
+; CHECK-NEXT:    srli a7, a7, 56
+; CHECK-NEXT:    srli t0, t0, 56
+; CHECK-NEXT:    divuw a7, t0, a7
+; CHECK-NEXT:    slli t0, a2, 40
+; CHECK-NEXT:    srli t2, t2, 56
+; CHECK-NEXT:    srli a6, a6, 56
+; CHECK-NEXT:    divuw a6, a6, t2
+; CHECK-NEXT:    slli t2, a1, 40
+; CHECK-NEXT:    srli t0, t0, 56
+; CHECK-NEXT:    srli t2, t2, 56
+; CHECK-NEXT:    divuw t0, t2, t0
+; CHECK-NEXT:    slli a2, a2, 48
+; CHECK-NEXT:    slli a1, a1, 48
+; CHECK-NEXT:    srli a2, a2, 56
+; CHECK-NEXT:    srli a1, a1, 56
+; CHECK-NEXT:    divuw a1, a1, a2
+; CHECK-NEXT:    ppaire.b a2, a5, a3
+; CHECK-NEXT:    ppaire.b a3, a6, a7
+; CHECK-NEXT:    ppaire.b a5, t0, t1
+; CHECK-NEXT:    ppaire.b a1, a4, a1
+; CHECK-NEXT:    ppaire.h a2, a3, a2
+; CHECK-NEXT:    ppaire.h a1, a1, a5
+; CHECK-NEXT:    pack a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %res = udiv <8 x i8> %a, %b
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pudiv_w(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pudiv_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    divuw a3, a1, a2
+; CHECK-NEXT:    srli a2, a2, 32
+; CHECK-NEXT:    srli a1, a1, 32
+; CHECK-NEXT:    divuw a1, a1, a2
+; CHECK-NEXT:    pack a1, a3, a1
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i32>, ptr %a_ptr
+  %b = load <2 x i32>, ptr %b_ptr
+  %res = udiv <2 x i32> %a, %b
+  store <2 x i32> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psrem_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psrem_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    srai a3, a2, 48
+; CHECK-NEXT:    srai a4, a1, 48
+; CHECK-NEXT:    slli a5, a2, 16
+; CHECK-NEXT:    sraiw a6, a2, 16
+; CHECK-NEXT:    remw a3, a4, a3
+; CHECK-NEXT:    sraiw a4, a1, 16
+; CHECK-NEXT:    remw a4, a4, a6
+; CHECK-NEXT:    slli a6, a1, 16
+; CHECK-NEXT:    srai a5, a5, 48
+; CHECK-NEXT:    srai a6, a6, 48
+; CHECK-NEXT:    remw a5, a6, a5
+; CHECK-NEXT:    slli a2, a2, 48
+; CHECK-NEXT:    slli a1, a1, 48
+; CHECK-NEXT:    srai a2, a2, 48
+; CHECK-NEXT:    srai a1, a1, 48
+; CHECK-NEXT:    remw a1, a1, a2
+; CHECK-NEXT:    ppaire.h a2, a5, a3
+; CHECK-NEXT:    ppaire.h a1, a1, a4
+; CHECK-NEXT:    pack a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %res = srem <4 x i16> %a, %b
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psrem_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psrem_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    srai a3, a2, 56
+; CHECK-NEXT:    srai a4, a1, 56
+; CHECK-NEXT:    slli a5, a2, 8
+; CHECK-NEXT:    slli a6, a1, 8
+; CHECK-NEXT:    slli a7, a2, 16
+; CHECK-NEXT:    slli t0, a1, 16
+; CHECK-NEXT:    slli t1, a2, 24
+; CHECK-NEXT:    sraiw t2, a2, 24
+; CHECK-NEXT:    remw a3, a4, a3
+; CHECK-NEXT:    sraiw a4, a1, 24
+; CHECK-NEXT:    remw a4, a4, t2
+; CHECK-NEXT:    slli t2, a1, 24
+; CHECK-NEXT:    srai a5, a5, 56
+; CHECK-NEXT:    srai a6, a6, 56
+; CHECK-NEXT:    remw a5, a6, a5
+; CHECK-NEXT:    slli a6, a2, 40
+; CHECK-NEXT:    srai a7, a7, 56
+; CHECK-NEXT:    srai t0, t0, 56
+; CHECK-NEXT:    remw a7, t0, a7
+; CHECK-NEXT:    slli t0, a1, 40
+; CHECK-NEXT:    srai t1, t1, 56
+; CHECK-NEXT:    srai t2, t2, 56
+; CHECK-NEXT:    remw t1, t2, t1
+; CHECK-NEXT:    slli t2, a2, 48
+; CHECK-NEXT:    srai a6, a6, 56
+; CHECK-NEXT:    srai t0, t0, 56
+; CHECK-NEXT:    remw a6, t0, a6
+; CHECK-NEXT:    slli t0, a1, 48
+; CHECK-NEXT:    srai t2, t2, 56
+; CHECK-NEXT:    srai t0, t0, 56
+; CHECK-NEXT:    remw t0, t0, t2
+; CHECK-NEXT:    slli a2, a2, 56
+; CHECK-NEXT:    slli a1, a1, 56
+; CHECK-NEXT:    srai a2, a2, 56
+; CHECK-NEXT:    srai a1, a1, 56
+; CHECK-NEXT:    remw a1, a1, a2
+; CHECK-NEXT:    ppaire.b a2, a5, a3
+; CHECK-NEXT:    ppaire.b a3, t1, a7
+; CHECK-NEXT:    ppaire.b a4, a6, a4
+; CHECK-NEXT:    ppaire.b a1, a1, t0
+; CHECK-NEXT:    ppaire.h a2, a3, a2
+; CHECK-NEXT:    ppaire.h a1, a1, a4
+; CHECK-NEXT:    pack a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %res = srem <8 x i8> %a, %b
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psrem_w(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psrem_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    remw a3, a1, a2
+; CHECK-NEXT:    srai a2, a2, 32
+; CHECK-NEXT:    srai a1, a1, 32
+; CHECK-NEXT:    remw a1, a1, a2
+; CHECK-NEXT:    pack a1, a3, a1
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i32>, ptr %a_ptr
+  %b = load <2 x i32>, ptr %b_ptr
+  %res = srem <2 x i32> %a, %b
+  store <2 x i32> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_purem_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_purem_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    lui a3, 16
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a4, a2, a3
+; CHECK-NEXT:    and a3, a1, a3
+; CHECK-NEXT:    srliw a5, a2, 16
+; CHECK-NEXT:    srliw a6, a1, 16
+; CHECK-NEXT:    remuw a3, a3, a4
+; CHECK-NEXT:    srli a4, a2, 48
+; CHECK-NEXT:    remuw a5, a6, a5
+; CHECK-NEXT:    srli a6, a1, 48
+; CHECK-NEXT:    remuw a4, a6, a4
+; CHECK-NEXT:    slli a2, a2, 16
+; CHECK-NEXT:    slli a1, a1, 16
+; CHECK-NEXT:    srli a2, a2, 48
+; CHECK-NEXT:    srli a1, a1, 48
+; CHECK-NEXT:    remuw a1, a1, a2
+; CHECK-NEXT:    ppaire.h a2, a3, a5
+; CHECK-NEXT:    ppaire.h a1, a1, a4
+; CHECK-NEXT:    pack a1, a2, a1
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %res = urem <4 x i16> %a, %b
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_purem_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_purem_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    srli a3, a2, 56
+; CHECK-NEXT:    srli a4, a1, 56
+; CHECK-NEXT:    slli a5, a2, 8
+; CHECK-NEXT:    slli a6, a1, 8
+; CHECK-NEXT:    slli a7, a2, 16
+; CHECK-NEXT:    slli t0, a1, 16
+; CHECK-NEXT:    srliw t1, a2, 24
+; CHECK-NEXT:    srliw t2, a1, 24
+; CHECK-NEXT:    remuw a3, a4, a3
+; CHECK-NEXT:    zext.b a4, a2
+; CHECK-NEXT:    remuw t1, t2, t1
+; CHECK-NEXT:    zext.b t2, a1
+; CHECK-NEXT:    remuw a4, t2, a4
+; CHECK-NEXT:    slli t2, a2, 24
+; CHECK-NEXT:    srli a5, a5, 56
+; CHECK-NEXT:    srli a6, a6, 56
+; CHECK-NEXT:    remuw a5, a6, a5
+; CHECK-NEXT:    slli a6, a1, 24
+; CHECK-NEXT:    srli a7, a7, 56
+; CHECK-NEXT:    srli t0, t0, 56
+; CHECK-NEXT:    remuw a7, t0, a7
+; CHECK-NEXT:    slli t0, a2, 40
+; CHECK-NEXT:    srli t2, t2, 56
+; CHECK-NEXT:    srli a6, a6, 56
+; CHECK-NEXT:    remuw a6, a6, t2
+; CHECK-NEXT:    slli t2, a1, 40
+; CHECK-NEXT:    srli t0, t0, 56
+; CHECK-NEXT:    srli t2, t2, 56
+; CHECK-NEXT:    remuw t0, t2, t0
+; CHECK-NEXT:    slli a2, a2, 48
+; CHECK-NEXT:    slli a1, a1, 48
+; CHECK-NEXT:    srli a2, a2, 56
+; CHECK-NEXT:    srli a1, a1, 56
+; CHECK-NEXT:    remuw a1, a1, a2
+; CHECK-NEXT:    ppaire.b a2, a5, a3
+; CHECK-NEXT:    ppaire.b a3, a6, a7
+; CHECK-NEXT:    ppaire.b a5, t0, t1
+; CHECK-NEXT:    ppaire.b a1, a4, a1
+; CHECK-NEXT:    ppaire.h a2, a3, a2
+; CHECK-NEXT:    ppaire.h a1, a1, a5
+; CHECK-NEXT:    pack a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %res = urem <8 x i8> %a, %b
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_purem_w(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_purem_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    remuw a3, a1, a2
+; CHECK-NEXT:    srli a2, a2, 32
+; CHECK-NEXT:    srli a1, a1, 32
+; CHECK-NEXT:    remuw a1, a1, a2
+; CHECK-NEXT:    pack a1, a3, a1
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i32>, ptr %a_ptr
+  %b = load <2 x i32>, ptr %b_ptr
+  %res = urem <2 x i32> %a, %b
+  store <2 x i32> %res, ptr %ret_ptr
+  ret void
+}



More information about the llvm-commits mailing list