[llvm] [RISCV] Optimize divide by constant for VP intrinsics (PR #125991)
Jesse Huang via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 19 06:20:16 PST 2025
https://github.com/jaidTw updated https://github.com/llvm/llvm-project/pull/125991
>From 00095e1e42a7075dfc258808962de386e4f35a5d Mon Sep 17 00:00:00 2001
From: Yeting Kuo <46629943+fakepaper56 at users.noreply.github.com>
Date: Tue, 31 Jan 2023 09:52:06 +0800
Subject: [PATCH 1/7] [VP][RISCV] Add vp isd opcodes VP_MULHU/VP_MULHS
Add ISD opcodes VP_MULHU/VP_MULHS which could be used by VP optimizations.
---
llvm/include/llvm/IR/VPIntrinsics.def | 5 +++++
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 8 ++++----
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 5 +++++
3 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 55f4719da7c8b..e71ca44779adb 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -180,6 +180,11 @@ HELPER_REGISTER_BINARY_INT_VP(vp_xor, VP_XOR, Xor, XOR)
#undef HELPER_REGISTER_BINARY_INT_VP
+BEGIN_REGISTER_VP_SDNODE(VP_MULHU, -1, vp_mulhu, 2, 3)
+END_REGISTER_VP_SDNODE(VP_MULHU)
+BEGIN_REGISTER_VP_SDNODE(VP_MULHS, -1, vp_mulhs, 2, 3)
+END_REGISTER_VP_SDNODE(VP_MULHS)
+
// llvm.vp.smin(x,y,mask,vlen)
BEGIN_REGISTER_VP(vp_smin, 2, 3, VP_SMIN, -1)
VP_PROPERTY_BINARYOP
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1000235ab4061..6e2f37d7c3dd4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1277,8 +1277,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::ADD: case ISD::VP_ADD:
case ISD::SUB: case ISD::VP_SUB:
case ISD::MUL: case ISD::VP_MUL:
- case ISD::MULHS:
- case ISD::MULHU:
+ case ISD::MULHS: case ISD::VP_MULHS:
+ case ISD::MULHU: case ISD::VP_MULHU:
case ISD::ABDS:
case ISD::ABDU:
case ISD::AVGCEILS:
@@ -4552,8 +4552,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::ADD: case ISD::VP_ADD:
case ISD::AND: case ISD::VP_AND:
case ISD::MUL: case ISD::VP_MUL:
- case ISD::MULHS:
- case ISD::MULHU:
+ case ISD::MULHS: case ISD::VP_MULHS:
+ case ISD::MULHU: case ISD::VP_MULHU:
case ISD::ABDS:
case ISD::ABDU:
case ISD::OR: case ISD::VP_OR:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8e3caf51d876b..3a4f1fefa9445 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -696,6 +696,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_FP_TO_UINT, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND,
ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE, ISD::VP_SMIN,
ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX,
+ ISD::VP_MULHU, ISD::VP_MULHS,
ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE,
ISD::VP_SADDSAT, ISD::VP_UADDSAT, ISD::VP_SSUBSAT,
ISD::VP_USUBSAT, ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF,
@@ -6410,6 +6411,8 @@ static unsigned getRISCVVLOp(SDValue Op) {
VP_CASE(ADD) // VP_ADD
VP_CASE(SUB) // VP_SUB
VP_CASE(MUL) // VP_MUL
+ VP_CASE(MULHS) // VP_MULHS
+ VP_CASE(MULHU) // VP_MULHU
VP_CASE(SDIV) // VP_SDIV
VP_CASE(SREM) // VP_SREM
VP_CASE(UDIV) // VP_UDIV
@@ -7605,6 +7608,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::VP_ADD:
case ISD::VP_SUB:
case ISD::VP_MUL:
+ case ISD::VP_MULHS:
+ case ISD::VP_MULHU:
case ISD::VP_SDIV:
case ISD::VP_UDIV:
case ISD::VP_SREM:
>From bf0b608a2a52e8b6a9edf7cc725c85de7bd47858 Mon Sep 17 00:00:00 2001
From: Jesse Huang <jesse.huang at sifive.com>
Date: Fri, 10 Feb 2023 11:52:47 +0800
Subject: [PATCH 2/7] [LLVM][VP] Optimize divide by constant for VP intrinsics
This patch implements divide-by-constant folds for vp.udiv/vp.sdiv and vp.urem/vp.srem, as well as some other minor folds such as division by a power of two, division by INT_MIN, etc.
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 +
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 330 ++++
.../CodeGen/SelectionDAG/TargetLowering.cpp | 253 +++
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10 +
llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll | 1540 +++++++++++++++++
5 files changed, 2137 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 04ee24c0916e5..6447752c451d8 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5108,6 +5108,10 @@ class TargetLowering : public TargetLoweringBase {
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const;
+ SDValue BuildVPSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
+ SmallVectorImpl<SDNode *> &Created) const;
+ SDValue BuildVPUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
+ SmallVectorImpl<SDNode *> &Created) const;
/// Targets may override this function to provide custom SDIV lowering for
/// power-of-2 denominators. If the target returns an empty SDValue, LLVM
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8858c2012c706..74ab35f8c5f05 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -564,6 +564,14 @@ namespace {
SDValue visitFSUBForFMACombine(SDNode *N);
SDValue visitFMULForFMADistributiveCombine(SDNode *N);
+ SDValue visitVPUDIV(SDNode *N);
+ SDValue visitVPUDIVLike(SDValue N0, SDValue N1, SDNode *N);
+ SDValue BuildVPUDIV(SDNode *N);
+ SDValue visitVPSDIV(SDNode *N);
+ SDValue visitVPSDIVLike(SDValue N0, SDValue N1, SDNode *N);
+ SDValue BuildVPSDIV(SDNode *N);
+ SDValue visitVPREM(SDNode *N);
+
SDValue XformToShuffleWithZero(SDNode *N);
bool reassociationCanBreakAddressingModePattern(unsigned Opc,
const SDLoc &DL,
@@ -5161,6 +5169,59 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
return SDValue();
}
+// handles ISD::VP_SREM and ISD::VP_UREM
+SDValue DAGCombiner::visitVPREM(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Mask = N->getOperand(2);
+ SDValue VL = N->getOperand(3);
+ EVT VT = N->getValueType(0);
+ EVT CCVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorElementCount());
+
+ bool IsSigned = (Opcode == ISD::VP_SREM);
+ SDLoc DL(N);
+
+ // fold (vp.urem X, -1) -> select(FX == -1, 0, FX)
+ // Freeze the numerator to avoid a miscompile with an undefined value.
+ if (!IsSigned && llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false)) {
+ SDValue F0 = DAG.getFreeze(N0);
+ SDValue EqualsNeg1 = DAG.getSetCCVP(DL, CCVT, F0, N1, ISD::SETEQ, Mask, VL);
+ return DAG.getNode(ISD::VP_SELECT, DL, VT, EqualsNeg1,
+ DAG.getConstant(0, DL, VT), F0, VL);
+ }
+
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+
+ // If X/C can be simplified by the division-by-constant logic, lower
+ // X%C to the equivalent of X-X/C*C.
+ // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
+ // speculative DIV must not cause a DIVREM conversion. We guard against this
+ // by skipping the simplification if isIntDivCheap(). When div is not cheap,
+ // combine will not return a DIVREM. Regardless, checking cheapness here
+ // makes sense since the simplification results in fatter code.
+ if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
+ SDValue OptimizedDiv =
+ IsSigned ? visitVPSDIVLike(N0, N1, N) : visitVPUDIVLike(N0, N1, N);
+ if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != N) {
+ // If the equivalent Div node also exists, update its users.
+ unsigned DivOpcode = IsSigned ? ISD::VP_SDIV : ISD::VP_UDIV;
+ if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(),
+ {N0, N1, Mask, VL}))
+ CombineTo(DivNode, OptimizedDiv);
+ SDValue Mul =
+ DAG.getNode(ISD::VP_MUL, DL, VT, OptimizedDiv, N1, Mask, VL);
+ SDValue Sub = DAG.getNode(ISD::VP_SUB, DL, VT, N0, Mul, Mask, VL);
+ AddToWorklist(OptimizedDiv.getNode());
+ AddToWorklist(Mul.getNode());
+ return Sub;
+ }
+ }
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitMULHS(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -27219,6 +27280,268 @@ SDValue DAGCombiner::visitVP_FSUB(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::BuildVPUDIV(SDNode *N) {
+ // when optimising for minimum size, we don't want to expand a div to a mul
+ // and a shift.
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return SDValue();
+
+ SmallVector<SDNode *, 8> Built;
+ if (SDValue S = TLI.BuildVPUDIV(N, DAG, LegalOperations, Built)) {
+ for (SDNode *N : Built)
+ AddToWorklist(N);
+ return S;
+ }
+
+ return SDValue();
+}
+
+/// Given an ISD::VP_SDIV node expressing a divide by constant, return
+/// a DAG expression to select that will generate the same value by multiplying
+/// by a magic number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
+SDValue DAGCombiner::BuildVPSDIV(SDNode *N) {
+ // when optimising for minimum size, we don't want to expand a div to a mul
+ // and a shift.
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return SDValue();
+
+ SmallVector<SDNode *, 8> Built;
+ if (SDValue S = TLI.BuildVPSDIV(N, DAG, LegalOperations, Built)) {
+ for (SDNode *N : Built)
+ AddToWorklist(N);
+ return S;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVPUDIV(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Mask = N->getOperand(2);
+ SDValue VL = N->getOperand(3);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ // fold (vp.udiv X, -1) -> vp.select(X == -1, 1, 0)
+ if (N1C && N1C->isAllOnes()) {
+ EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorElementCount());
+ return DAG.getNode(ISD::VP_SELECT, DL, VT,
+ DAG.getSetCCVP(DL, CCVT, N0, N1, ISD::SETEQ, Mask, VL),
+ DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT),
+ VL);
+ }
+
+ if (SDValue V = visitVPUDIVLike(N0, N1, N)) {
+ // If the corresponding remainder node exists, update its users with
+ // (Dividend - (Quotient * Divisor)).
+ if (SDNode *RemNode = DAG.getNodeIfExists(ISD::VP_UREM, N->getVTList(),
+ {N0, N1, Mask, VL})) {
+ SDValue Mul = DAG.getNode(ISD::VP_MUL, DL, VT, V, N1, Mask, VL);
+ SDValue Sub = DAG.getNode(ISD::VP_SUB, DL, VT, N0, Mul, Mask, VL);
+ AddToWorklist(Mul.getNode());
+ AddToWorklist(Sub.getNode());
+ CombineTo(RemNode, Sub);
+ }
+ return V;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVPUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
+ SDLoc DL(N);
+ SDValue Mask = N->getOperand(2);
+ SDValue VL = N->getOperand(3);
+ EVT VT = N->getValueType(0);
+
+ // fold (vp.udiv x, (1 << c)) -> vp.lshr(x, c)
+ if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
+ DAG.isKnownToBeAPowerOfTwo(N1)) {
+ SDValue LogBase2 = BuildLogBase2(N1, DL);
+ AddToWorklist(LogBase2.getNode());
+
+ EVT ShiftVT = getShiftAmountTy(N0.getValueType());
+ SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
+ AddToWorklist(Trunc.getNode());
+ return DAG.getNode(ISD::VP_SRL, DL, VT, N0, Trunc, Mask, VL);
+ }
+
+ // fold (vp.udiv x, (vp.shl c, y)) -> vp.lshr(x, vp.add(log2(c)+y)) iff c is
+ // power of 2
+ if (N1.getOpcode() == ISD::VP_SHL && N1->getOperand(2) == Mask &&
+ N1->getOperand(3) == VL) {
+ SDValue N10 = N1.getOperand(0);
+ if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) &&
+ DAG.isKnownToBeAPowerOfTwo(N10)) {
+ SDValue LogBase2 = BuildLogBase2(N10, DL);
+ AddToWorklist(LogBase2.getNode());
+
+ EVT ADDVT = N1.getOperand(1).getValueType();
+ SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT);
+ AddToWorklist(Trunc.getNode());
+ SDValue Add = DAG.getNode(ISD::VP_ADD, DL, ADDVT, N1.getOperand(1), Trunc,
+ Mask, VL);
+ AddToWorklist(Add.getNode());
+ return DAG.getNode(ISD::VP_SRL, DL, VT, N0, Add, Mask, VL);
+ }
+ }
+
+ // fold (vp.udiv x, Splat(shl c, y)) -> vp.lshr(x, add(log2(c)+y)) iff c is
+ // power of 2
+ if (N1.getOpcode() == ISD::SPLAT_VECTOR) {
+ SDValue N10 = N1.getOperand(0);
+ if (N10.getOpcode() == ISD::SHL) {
+ SDValue N0SHL = N10.getOperand(0);
+ if (isa<ConstantSDNode>(N0SHL) && DAG.isKnownToBeAPowerOfTwo(N0SHL)) {
+ SDValue LogBase2 = BuildLogBase2(N0SHL, DL);
+ AddToWorklist(LogBase2.getNode());
+
+ EVT ADDVT = N10.getOperand(1).getValueType();
+ SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT);
+ AddToWorklist(Trunc.getNode());
+ SDValue Add =
+ DAG.getNode(ISD::ADD, DL, ADDVT, N10.getOperand(1), Trunc);
+ AddToWorklist(Add.getNode());
+ SDValue Splat = DAG.getSplatVector(VT, DL, Add);
+ AddToWorklist(Splat.getNode());
+ return DAG.getNode(ISD::VP_SRL, DL, VT, N0, Splat, Mask, VL);
+ }
+ }
+ }
+
+ // fold (vp.udiv x, c) -> alternate
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (isConstantOrConstantVector(N1) &&
+ !TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue Op = BuildVPUDIV(N))
+ return Op;
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVPSDIV(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Mask = N->getOperand(2);
+ SDValue VL = N->getOperand(3);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // fold (vp.sdiv X, -1) -> 0-X
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ if (N1C && N1C->isAllOnes())
+ return DAG.getNode(ISD::VP_SUB, DL, VT, DAG.getConstant(0, DL, VT), N0,
+ Mask, VL);
+
+ // fold (vp.sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
+ if (N1C && N1C->getAPIntValue().isMinSignedValue()) {
+ EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorElementCount());
+ return DAG.getNode(ISD::VP_SELECT, DL, VT,
+ DAG.getSetCCVP(DL, CCVT, N0, N1, ISD::SETEQ, Mask, VL),
+ DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT),
+ VL);
+ }
+
+ // If we know the sign bits of both operands are zero, strength reduce to a
+ // vp.udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
+ if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::VP_UDIV, DL, N1.getValueType(), N0, N1, Mask, VL);
+
+ if (SDValue V = visitVPSDIVLike(N0, N1, N)) {
+ // If the corresponding remainder node exists, update its users with
+ // (Dividend - (Quotient * Divisor)).
+ if (SDNode *RemNode = DAG.getNodeIfExists(ISD::VP_SREM, N->getVTList(),
+ {N0, N1, Mask, VL})) {
+ SDValue Mul = DAG.getNode(ISD::VP_MUL, DL, VT, V, N1, Mask, VL);
+ SDValue Sub = DAG.getNode(ISD::VP_SUB, DL, VT, N0, Mul, Mask, VL);
+ AddToWorklist(Mul.getNode());
+ AddToWorklist(Sub.getNode());
+ CombineTo(RemNode, Sub);
+ }
+ return V;
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVPSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
+ SDLoc DL(N);
+ SDValue Mask = N->getOperand(2);
+ SDValue VL = N->getOperand(3);
+ EVT VT = N->getValueType(0);
+ unsigned BitWidth = VT.getScalarSizeInBits();
+
+ // fold (vp.sdiv X, V of pow 2)
+ if (N1.getOpcode() == ISD::SPLAT_VECTOR &&
+ isDivisorPowerOfTwo(N1.getOperand(0))) {
+ // Create constants that are functions of the shift amount value.
+ SDValue N = N1.getOperand(0);
+ EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorElementCount());
+ EVT ScalarShiftAmtTy =
+ getShiftAmountTy(N0.getValueType().getVectorElementType());
+ SDValue Bits = DAG.getConstant(BitWidth, DL, ScalarShiftAmtTy);
+ SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT.getVectorElementType(), N);
+ C1 = DAG.getZExtOrTrunc(C1, DL, ScalarShiftAmtTy);
+ SDValue Inexact = DAG.getNode(ISD::SUB, DL, ScalarShiftAmtTy, Bits, C1);
+ if (!isa<ConstantSDNode>(Inexact))
+ return SDValue();
+
+ // Splat the sign bit into the register
+ EVT VecShiftAmtTy = EVT::getVectorVT(*DAG.getContext(), ScalarShiftAmtTy,
+ VT.getVectorElementCount());
+ SDValue Sign =
+ DAG.getNode(ISD::VP_SRA, DL, VT, N0,
+ DAG.getConstant(BitWidth - 1, DL, VecShiftAmtTy), Mask, VL);
+ AddToWorklist(Sign.getNode());
+
+ // Add N0, ((N0 < 0) ? abs(N1) - 1 : 0);
+ Inexact = DAG.getSplat(VT, DL, Inexact);
+ C1 = DAG.getSplat(VT, DL, C1);
+ SDValue Srl = DAG.getNode(ISD::VP_SRL, DL, VT, Sign, Inexact, Mask, VL);
+ AddToWorklist(Srl.getNode());
+ SDValue Add = DAG.getNode(ISD::VP_ADD, DL, VT, N0, Srl, Mask, VL);
+ AddToWorklist(Add.getNode());
+ SDValue Sra = DAG.getNode(ISD::VP_SRA, DL, VT, Add, C1, Mask, VL);
+ AddToWorklist(Sra.getNode());
+
+ // Special case: (sdiv X, 1) -> X
+ // Special Case: (sdiv X, -1) -> 0-X
+ SDValue One = DAG.getConstant(1, DL, VT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
+ SDValue IsOne = DAG.getSetCCVP(DL, CCVT, N1, One, ISD::SETEQ, Mask, VL);
+ SDValue IsAllOnes =
+ DAG.getSetCCVP(DL, CCVT, N1, AllOnes, ISD::SETEQ, Mask, VL);
+ SDValue IsOneOrAllOnes =
+ DAG.getNode(ISD::VP_OR, DL, CCVT, IsOne, IsAllOnes, Mask, VL);
+ Sra = DAG.getNode(ISD::VP_SELECT, DL, VT, IsOneOrAllOnes, N0, Sra, VL);
+
+ // If dividing by a positive value, we're done. Otherwise, the result must
+ // be negated.
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue Sub = DAG.getNode(ISD::VP_SUB, DL, VT, Zero, Sra, Mask, VL);
+
+ // FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding.
+ SDValue IsNeg = DAG.getSetCCVP(DL, CCVT, N1, Zero, ISD::SETLT, Mask, VL);
+ SDValue Res = DAG.getNode(ISD::VP_SELECT, DL, VT, IsNeg, Sub, Sra, VL);
+ return Res;
+ }
+
+ // If integer divide is expensive and we satisfy the requirements, emit an
+ // alternate sequence. Targets may check function attributes for size/speed
+ // trade-offs.
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (isConstantOrConstantVector(N1) &&
+ !TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue Op = BuildVPSDIV(N))
+ return Op;
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitVPOp(SDNode *N) {
if (N->getOpcode() == ISD::VP_GATHER)
@@ -27262,6 +27585,13 @@ SDValue DAGCombiner::visitVPOp(SDNode *N) {
return visitMUL<VPMatchContext>(N);
case ISD::VP_SUB:
return foldSubCtlzNot<VPMatchContext>(N, DAG);
+ case ISD::VP_UDIV:
+ return visitVPUDIV(N);
+ case ISD::VP_SDIV:
+ return visitVPSDIV(N);
+ case ISD::VP_UREM:
+ case ISD::VP_SREM:
+ return visitVPREM(N);
default:
break;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index adfb96041c5c0..82a2500ff386d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6492,6 +6492,121 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, dl, VT, Q, T);
}
+/// Given an ISD::VP_SDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
+SDValue TargetLowering::BuildVPSDIV(SDNode *N, SelectionDAG &DAG,
+ bool IsAfterLegalization,
+ SmallVectorImpl<SDNode *> &Created) const {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ EVT ShSVT = ShVT.getScalarType();
+ unsigned EltBits = VT.getScalarSizeInBits();
+
+ // Check to see if we can do this.
+ if (!isTypeLegal(VT))
+ return SDValue();
+
+ SmallVector<SDValue, 16> MagicFactors, Factors, Shifts, ShiftMasks;
+
+ auto BuildSDIVPattern = [&](ConstantSDNode *C) {
+ if (C->isZero())
+ return false;
+
+ const APInt &Divisor = C->getAPIntValue();
+ SignedDivisionByConstantInfo magics =
+ SignedDivisionByConstantInfo::get(Divisor);
+ int NumeratorFactor = 0;
+ int ShiftMask = -1;
+
+ if (Divisor.isOne() || Divisor.isAllOnes()) {
+ // If d is +1/-1, we just multiply the numerator by +1/-1.
+ NumeratorFactor = Divisor.getSExtValue();
+ magics.Magic = 0;
+ magics.ShiftAmount = 0;
+ ShiftMask = 0;
+ } else if (Divisor.isStrictlyPositive() && magics.Magic.isNegative()) {
+ // If d > 0 and m < 0, add the numerator.
+ NumeratorFactor = 1;
+ } else if (Divisor.isNegative() && magics.Magic.isStrictlyPositive()) {
+ // If d < 0 and m > 0, subtract the numerator.
+ NumeratorFactor = -1;
+ }
+
+ MagicFactors.push_back(DAG.getConstant(magics.Magic, DL, SVT));
+ Factors.push_back(DAG.getSignedConstant(NumeratorFactor, DL, SVT));
+ Shifts.push_back(DAG.getConstant(magics.ShiftAmount, DL, ShSVT));
+ ShiftMasks.push_back(DAG.getSignedConstant(ShiftMask, DL, SVT));
+ return true;
+ };
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Mask = N->getOperand(2);
+ SDValue VL = N->getOperand(3);
+
+ // Collect the shifts / magic values from each element.
+ if (!ISD::matchUnaryPredicate(N1, BuildSDIVPattern))
+ return SDValue();
+
+ SDValue MagicFactor, Factor, Shift, ShiftMask;
+ if (N1.getOpcode() == ISD::BUILD_VECTOR) {
+ MagicFactor = DAG.getBuildVector(VT, DL, MagicFactors);
+ Factor = DAG.getBuildVector(VT, DL, Factors);
+ Shift = DAG.getBuildVector(ShVT, DL, Shifts);
+ ShiftMask = DAG.getBuildVector(VT, DL, ShiftMasks);
+ } else if (N1.getOpcode() == ISD::SPLAT_VECTOR) {
+ assert(MagicFactors.size() == 1 && Factors.size() == 1 &&
+ Shifts.size() == 1 && ShiftMasks.size() == 1 &&
+ "Expected matchUnaryPredicate to return one element for scalable "
+ "vectors");
+ MagicFactor = DAG.getSplatVector(VT, DL, MagicFactors[0]);
+ Factor = DAG.getSplatVector(VT, DL, Factors[0]);
+ Shift = DAG.getSplatVector(ShVT, DL, Shifts[0]);
+ ShiftMask = DAG.getSplatVector(VT, DL, ShiftMasks[0]);
+ } else {
+ assert(isa<ConstantSDNode>(N1) && "Expected a constant");
+ MagicFactor = MagicFactors[0];
+ Factor = Factors[0];
+ Shift = Shifts[0];
+ ShiftMask = ShiftMasks[0];
+ }
+
+ // Multiply the numerator (operand 0) by the magic value.
+ auto GetMULHS = [&](SDValue X, SDValue Y) {
+ if (isOperationLegalOrCustom(ISD::VP_MULHS, VT, IsAfterLegalization))
+ return DAG.getNode(ISD::VP_MULHS, DL, VT, X, Y, Mask, VL);
+ return SDValue();
+ };
+
+ SDValue Q = GetMULHS(N0, MagicFactor);
+ if (!Q)
+ return SDValue();
+
+ Created.push_back(Q.getNode());
+
+ // (Optionally) Add/subtract the numerator using Factor.
+ Factor = DAG.getNode(ISD::VP_MUL, DL, VT, N0, Factor, Mask, VL);
+ Created.push_back(Factor.getNode());
+ Q = DAG.getNode(ISD::VP_ADD, DL, VT, Q, Factor, Mask, VL);
+ Created.push_back(Q.getNode());
+
+ // Shift right algebraic by shift value.
+ Q = DAG.getNode(ISD::VP_SRA, DL, VT, Q, Shift, Mask, VL);
+ Created.push_back(Q.getNode());
+
+ // Extract the sign bit, mask it and add it to the quotient.
+ SDValue SignShift = DAG.getConstant(EltBits - 1, DL, ShVT);
+ SDValue T = DAG.getNode(ISD::VP_SRL, DL, VT, Q, SignShift, Mask, VL);
+ Created.push_back(T.getNode());
+ T = DAG.getNode(ISD::VP_AND, DL, VT, T, ShiftMask, Mask, VL);
+ Created.push_back(T.getNode());
+ return DAG.getNode(ISD::VP_ADD, DL, VT, Q, T, Mask, VL);
+}
+
/// Given an ISD::UDIV node expressing a divide by constant,
/// return a DAG expression to select that will generate the same value by
/// multiplying by a magic number.
@@ -6692,6 +6807,144 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
return DAG.getSelect(dl, VT, IsOne, N0, Q);
}
+/// Given an ISD::VP_UDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
+SDValue TargetLowering::BuildVPUDIV(SDNode *N, SelectionDAG &DAG,
+ bool IsAfterLegalization,
+ SmallVectorImpl<SDNode *> &Created) const {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ EVT ShSVT = ShVT.getScalarType();
+ unsigned EltBits = VT.getScalarSizeInBits();
+
+ // Check to see if we can do this.
+ if (!isTypeLegal(VT))
+ return SDValue();
+
+ bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
+
+ SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
+
+ auto BuildUDIVPattern = [&](ConstantSDNode *C) {
+ if (C->isZero())
+ return false;
+ // FIXME: We should use a narrower constant when the upper
+ // bits are known to be zero.
+ const APInt &Divisor = C->getAPIntValue();
+ SDValue PreShift, MagicFactor, NPQFactor, PostShift;
+
+ // Magic algorithm doesn't work for division by 1. We need to emit a select
+ // at the end.
+ if (Divisor.isOne()) {
+ PreShift = PostShift = DAG.getUNDEF(ShSVT);
+ MagicFactor = NPQFactor = DAG.getUNDEF(SVT);
+ } else {
+ UnsignedDivisionByConstantInfo magics =
+ UnsignedDivisionByConstantInfo::get(Divisor);
+
+ MagicFactor = DAG.getConstant(magics.Magic, DL, SVT);
+
+ assert(magics.PreShift < Divisor.getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ assert(magics.PostShift < Divisor.getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ assert((!magics.IsAdd || magics.PreShift == 0) && "Unexpected pre-shift");
+ PreShift = DAG.getConstant(magics.PreShift, DL, ShSVT);
+ PostShift = DAG.getConstant(magics.PostShift, DL, ShSVT);
+ NPQFactor = DAG.getConstant(
+ magics.IsAdd ? APInt::getOneBitSet(EltBits, EltBits - 1)
+ : APInt::getZero(EltBits),
+ DL, SVT);
+ UseNPQ |= magics.IsAdd;
+ UsePreShift |= magics.PreShift != 0;
+ UsePostShift |= magics.PostShift != 0;
+ }
+
+ PreShifts.push_back(PreShift);
+ MagicFactors.push_back(MagicFactor);
+ NPQFactors.push_back(NPQFactor);
+ PostShifts.push_back(PostShift);
+ return true;
+ };
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Mask = N->getOperand(2);
+ SDValue VL = N->getOperand(3);
+
+ // Collect the shifts/magic values from each element.
+ if (!ISD::matchUnaryPredicate(N1, BuildUDIVPattern))
+ return SDValue();
+
+ SDValue PreShift, PostShift, MagicFactor, NPQFactor;
+ if (N1.getOpcode() == ISD::BUILD_VECTOR) {
+ PreShift = DAG.getBuildVector(ShVT, DL, PreShifts);
+ MagicFactor = DAG.getBuildVector(VT, DL, MagicFactors);
+ NPQFactor = DAG.getBuildVector(VT, DL, NPQFactors);
+ PostShift = DAG.getBuildVector(ShVT, DL, PostShifts);
+ } else if (N1.getOpcode() == ISD::SPLAT_VECTOR) {
+ assert(PreShifts.size() == 1 && MagicFactors.size() == 1 &&
+ NPQFactors.size() == 1 && PostShifts.size() == 1 &&
+ "Expected matchUnaryPredicate to return one for scalable vectors");
+ PreShift = DAG.getSplatVector(ShVT, DL, PreShifts[0]);
+ MagicFactor = DAG.getSplatVector(VT, DL, MagicFactors[0]);
+ NPQFactor = DAG.getSplatVector(VT, DL, NPQFactors[0]);
+ PostShift = DAG.getSplatVector(ShVT, DL, PostShifts[0]);
+ } else {
+ assert(isa<ConstantSDNode>(N1) && "Expected a constant");
+ PreShift = PreShifts[0];
+ MagicFactor = MagicFactors[0];
+ NPQFactor = NPQFactors[0];
+ PostShift = PostShifts[0];
+ }
+
+ SDValue Q = N0;
+ if (UsePreShift) {
+ Q = DAG.getNode(ISD::VP_SRL, DL, VT, Q, PreShift, Mask, VL);
+ Created.push_back(Q.getNode());
+ }
+
+ auto GetMULHU = [&](SDValue X, SDValue Y) {
+ if (isOperationLegalOrCustom(ISD::VP_MULHU, VT, IsAfterLegalization))
+ return DAG.getNode(ISD::VP_MULHU, DL, VT, X, Y, Mask, VL);
+ return SDValue();
+ };
+
+ // Multiply the numerator (operand 0) by the magic value.
+ Q = GetMULHU(Q, MagicFactor);
+ if (!Q)
+ return SDValue();
+
+ Created.push_back(Q.getNode());
+
+ if (UseNPQ) {
+ SDValue NPQ = DAG.getNode(ISD::VP_SUB, DL, VT, N0, Q, Mask, VL);
+ Created.push_back(NPQ.getNode());
+
+ // For vectors we might have a mix of non-NPQ/NPQ paths, so use
+ // MULHU to act as a SRL-by-1 for NPQ, else multiply by zero.
+ NPQ = GetMULHU(NPQ, NPQFactor);
+ Created.push_back(NPQ.getNode());
+
+ Q = DAG.getNode(ISD::VP_ADD, DL, VT, NPQ, Q, Mask, VL);
+ Created.push_back(Q.getNode());
+ }
+
+ if (UsePostShift) {
+ Q = DAG.getNode(ISD::VP_SRL, DL, VT, Q, PostShift, Mask, VL);
+ Created.push_back(Q.getNode());
+ }
+
+ EVT SetCCVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorElementCount());
+ SDValue One = DAG.getConstant(1, DL, VT);
+ SDValue IsOne = DAG.getSetCCVP(DL, SetCCVT, N1, One, ISD::SETEQ, Mask, VL);
+ return DAG.getNode(ISD::VP_SELECT, DL, VT, IsOne, N0, Q, VL);
+}
+
/// If all values in Values that *don't* match the predicate are same 'splat'
/// value, then replace all values with that splat value.
/// Else, if AlternativeReplacement was provided, then replace all values that
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 3a4f1fefa9445..72b6ba0c2d8ce 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -871,6 +871,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(IntegerVPOps, VT, Custom);
+ // Zve64* does not support VP_MULHU/S with nxvXi64.
+ if (VT.getVectorElementType() == MVT::i64 && !Subtarget.hasStdExtV()) {
+ setOperationAction({ISD::VP_MULHU, ISD::VP_MULHS}, VT, Expand);
+ }
+
setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
setOperationAction({ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER},
@@ -1300,6 +1305,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(IntegerVPOps, VT, Custom);
+ // Zve64* does not support VP_MULHU/S with nxvXi64.
+ if (VT.getVectorElementType() == MVT::i64 && !Subtarget.hasStdExtV()) {
+ setOperationAction({ISD::VP_MULHU, ISD::VP_MULHS}, VT, Expand);
+ }
+
if (Subtarget.hasStdExtZvkb())
setOperationAction({ISD::BSWAP, ISD::ROTL, ISD::ROTR}, VT, Custom);
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll
new file mode 100644
index 0000000000000..f78a0ec7f2378
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll
@@ -0,0 +1,1540 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s
+
+declare <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.sdiv.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.sdiv.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.urem.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.urem.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.urem.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.srem.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.srem.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.srem.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.shl.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.shl.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.shl.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+
+define <vscale x 8 x i8> @vpudiv_by_max_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_max_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 255, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpudiv_by_max_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_max_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 65535, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpudiv_by_max_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_max_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 4294967295, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_max_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_max_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 18446744073709551615, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @fold_vpudiv_vpurem_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fold_vpudiv_vpurem_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t
+; CHECK-NEXT: li a0, -128
+; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 128, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ %u = call <vscale x 8 x i8> @llvm.vp.urem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ %x = add <vscale x 8 x i8> %v, %u
+ ret <vscale x 8 x i8> %x
+}
+
+define <vscale x 4 x i16> @fold_vpudiv_vpurem_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fold_vpudiv_vpurem_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 16384, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ %u = call <vscale x 4 x i16> @llvm.vp.urem.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ %x = add <vscale x 4 x i16> %v, %u
+ ret <vscale x 4 x i16> %x
+}
+
+define <vscale x 2 x i32> @fold_vpudiv_vpurem_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fold_vpudiv_vpurem_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 16384, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ %u = call <vscale x 2 x i32> @llvm.vp.urem.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ %x = add <vscale x 2 x i32> %v, %u
+ ret <vscale x 2 x i32> %x
+}
+
+define <vscale x 1 x i64> @fold_vpudiv_vpurem_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fold_vpudiv_vpurem_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 16384, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ %u = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ %x = add <vscale x 1 x i64> %v, %u
+ ret <vscale x 1 x i64> %x
+}
+
+define <vscale x 8 x i8> @vpudiv_by_shl2_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_shl2_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %sh = shl i8 2, %b
+ %vec = insertelement <vscale x 8 x i8> undef, i8 %sh, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpudiv_by_shl2_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_shl2_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srli a0, a0, 48
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %sh = shl i16 2, %b
+ %vec = insertelement <vscale x 4 x i16> undef, i16 %sh, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpudiv_by_shl2_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_shl2_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 32
+; CHECK-NEXT: srli a0, a0, 32
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %sh = shl i32 2, %b
+ %vec = insertelement <vscale x 2 x i32> undef, i32 %sh, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_shl2_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_shl2_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %sh = shl i64 2, %b
+ %vec = insertelement <vscale x 1 x i64> undef, i64 %sh, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpudiv_by_vpshl2_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_vpshl2_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec1 = insertelement <vscale x 8 x i8> undef, i8 4, i32 0
+ %splat1 = shufflevector <vscale x 8 x i8> %vec1, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %vec2 = insertelement <vscale x 8 x i8> undef, i8 %b, i32 0
+ %splat2 = shufflevector <vscale x 8 x i8> %vec2, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %sh = call <vscale x 8 x i8> @llvm.vp.shl.nxv8i8(<vscale x 8 x i8> %splat1, <vscale x 8 x i8> %splat2, <vscale x 8 x i1> %m, i32 %evl)
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %sh, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpudiv_by_vpshl2_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_vpshl2_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec1 = insertelement <vscale x 4 x i16> undef, i16 4, i32 0
+ %splat1 = shufflevector <vscale x 4 x i16> %vec1, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %vec2 = insertelement <vscale x 4 x i16> undef, i16 %b, i32 0
+ %splat2 = shufflevector <vscale x 4 x i16> %vec2, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %sh = call <vscale x 4 x i16> @llvm.vp.shl.nxv4i16(<vscale x 4 x i16> %splat1, <vscale x 4 x i16> %splat2, <vscale x 4 x i1> %m, i32 %evl)
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %sh, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpudiv_by_vpshl2_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_vpshl2_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec1 = insertelement <vscale x 2 x i32> undef, i32 4, i32 0
+ %splat1 = shufflevector <vscale x 2 x i32> %vec1, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %vec2 = insertelement <vscale x 2 x i32> undef, i32 %b, i32 0
+ %splat2 = shufflevector <vscale x 2 x i32> %vec2, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %sh = call <vscale x 2 x i32> @llvm.vp.shl.nxv2i32(<vscale x 2 x i32> %splat1, <vscale x 2 x i32> %splat2, <vscale x 2 x i1> %m, i32 %evl)
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %sh, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_vpshl2_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_vpshl2_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec1 = insertelement <vscale x 1 x i64> undef, i64 4, i32 0
+ %splat1 = shufflevector <vscale x 1 x i64> %vec1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %vec2 = insertelement <vscale x 1 x i64> undef, i64 %b, i32 0
+ %splat2 = shufflevector <vscale x 1 x i64> %vec2, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %sh = call <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64> %splat1, <vscale x 1 x i64> %splat2, <vscale x 1 x i1> %m, i32 %evl)
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %sh, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpudiv_by_const_no_add_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_no_add_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: li a1, -51
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 5, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpudiv_by_const_no_add_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_no_add_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: lui a1, 1048573
+; CHECK-NEXT: addiw a1, a1, -819
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 5, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpudiv_by_const_no_add_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_no_add_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: lui a1, 838861
+; CHECK-NEXT: addiw a1, a1, -819
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 5, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_const_no_add_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_no_add_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, %hi(.LCPI19_0)
+; CHECK-NEXT: ld a1, %lo(.LCPI19_0)(a1)
+; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 5, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpudiv_by_const_with_add_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_with_add_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 7
+; CHECK-NEXT: li a1, 37
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
+; CHECK-NEXT: li a0, -128
+; CHECK-NEXT: vmulhu.vx v11, v11, a0, v0.t
+; CHECK-NEXT: vadd.vv v10, v11, v10, v0.t
+; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 7, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpudiv_by_const_with_add_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_with_add_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 7
+; CHECK-NEXT: lui a1, 2
+; CHECK-NEXT: addiw a1, a1, 1171
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: vmulhu.vx v11, v11, a0, v0.t
+; CHECK-NEXT: vadd.vv v10, v11, v10, v0.t
+; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 7, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpudiv_by_const_with_add_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_with_add_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 7
+; CHECK-NEXT: lui a1, 149797
+; CHECK-NEXT: addiw a1, a1, -1755
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vmulhu.vx v11, v11, a0, v0.t
+; CHECK-NEXT: vadd.vv v10, v11, v10, v0.t
+; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 7, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_const_with_add_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_with_add_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, %hi(.LCPI23_0)
+; CHECK-NEXT: ld a1, %lo(.LCPI23_0)(a1)
+; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 7
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: slli a0, a0, 63
+; CHECK-NEXT: vmulhu.vx v11, v11, a0, v0.t
+; CHECK-NEXT: vadd.vv v10, v11, v10, v0.t
+; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 7, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpsdiv_by_neg1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_neg1_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 -1, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_by_neg1_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_neg1_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 -1, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 4 x i16> @vpsdiv_by_neg1_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_neg1_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 -1, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sdiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpsdiv_by_neg1_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_neg1_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 -1, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sdiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 8 x i8> @vpsdiv_by_min_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_min_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -128
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 -128, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_by_min_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_min_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: slli a1, a1, 63
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 -9223372036854775808, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 4 x i16> @vpsdiv_by_min_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_min_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048568
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 -32768, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sdiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpsdiv_by_min_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_min_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 524288
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 -2147483648, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sdiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 4 x i16> @vpsdiv_pow2_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_pow2_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 4
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsra.vi v11, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 14, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmsgt.vi v0, v11, 4, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 4, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sdiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 8 x i8> @vpsdiv_pow2_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_pow2_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 4
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsra.vi v11, v8, 7, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 6, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmsgt.vi v0, v11, 4, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 4, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 2 x i32> @vpsdiv_pow2_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_pow2_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 4
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsra.vi v11, v8, 31, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 30, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmsgt.vi v0, v11, 4, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 4, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sdiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_pow2_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_pow2_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 4
+; CHECK-NEXT: li a1, 63
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsra.vx v11, v8, a1, v0.t
+; CHECK-NEXT: li a1, 62
+; CHECK-NEXT: vsrl.vx v11, v11, a1, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v0, v10, v12
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmsgt.vi v0, v11, 4, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 4, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpsdiv_const_no_ashr_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_no_ashr_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: li a0, 86
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 3, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpsdiv_const_no_ashr_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_no_ashr_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: lui a0, 5
+; CHECK-NEXT: addiw a0, a0, 1366
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 3, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sdiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpsdiv_const_no_ashr_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_no_ashr_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: lui a0, 349525
+; CHECK-NEXT: addiw a0, a0, 1366
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 3, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sdiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_const_no_ashr_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_no_ashr_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, %hi(.LCPI39_0)
+; CHECK-NEXT: ld a1, %lo(.LCPI39_0)(a1)
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vsrl.vx v9, v8, a0, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 3, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpsdiv_const_ashr_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_ashr_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: li a0, 103
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 5, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpsdiv_const_ashr_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_ashr_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: lui a0, 6
+; CHECK-NEXT: addiw a0, a0, 1639
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 5, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sdiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpsdiv_const_ashr_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_ashr_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: lui a0, 419430
+; CHECK-NEXT: addiw a0, a0, 1639
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 5, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sdiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_const_ashr_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_ashr_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, %hi(.LCPI43_0)
+; CHECK-NEXT: ld a1, %lo(.LCPI43_0)(a1)
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vsrl.vx v9, v8, a0, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 5, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpsdiv_const_add_ashr_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_add_ashr_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
+; CHECK-NEXT: li a0, -109
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 7, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpsdiv_const_add_ashr_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_add_ashr_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
+; CHECK-NEXT: lui a0, 1048569
+; CHECK-NEXT: addiw a0, a0, -1911
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 3, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 15, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sdiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpsdiv_const_add_ashr_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_add_ashr_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 1
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
+; CHECK-NEXT: lui a0, 599186
+; CHECK-NEXT: addiw a0, a0, 1171
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 7, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sdiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_const_add_ashr_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_add_ashr_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, %hi(.LCPI47_0)
+; CHECK-NEXT: ld a1, %lo(.LCPI47_0)(a1)
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: vmul.vx v9, v8, a0, v0.t
+; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 3, v0.t
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vsrl.vx v9, v8, a0, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 15, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpsdiv_const_sub_ashr_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
+; CHECK-NEXT: li a0, 109
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 -7, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpsdiv_const_sub_ashr_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
+; CHECK-NEXT: lui a0, 7
+; CHECK-NEXT: addiw a0, a0, 1911
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 3, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 -15, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sdiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpsdiv_const_sub_ashr_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
+; CHECK-NEXT: lui a0, 449390
+; CHECK-NEXT: addiw a0, a0, -1171
+; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t
+; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 -7, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sdiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_const_sub_ashr_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, %hi(.LCPI51_0)
+; CHECK-NEXT: ld a1, %lo(.LCPI51_0)(a1)
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: vmul.vx v9, v8, a0, v0.t
+; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vsrl.vx v9, v8, a0, v0.t
+; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 -3, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vpurem_by_max_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_max_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 18446744073709551615, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 4 x i16> @vpurem_by_max_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_max_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 65535, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.urem.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 8 x i8> @vpurem_by_max_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_max_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 255, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.urem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 2 x i32> @vpurem_by_max_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_max_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 4294967295, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.urem.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpurem_by_const_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_const_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: lui a1, %hi(.LCPI56_0)
+; CHECK-NEXT: ld a1, %lo(.LCPI56_0)(a1)
+; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v11, v8, a1, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmul.vx v10, v10, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 5, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 4 x i16> @vpurem_by_const_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_const_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 5
+; CHECK-NEXT: lui a1, 1048573
+; CHECK-NEXT: addiw a1, a1, -819
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v11, v8, a1, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmul.vx v10, v10, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 5, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.urem.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 8 x i8> @vpurem_by_const_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_const_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 5
+; CHECK-NEXT: li a1, -51
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v11, v8, a1, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmul.vx v10, v10, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 5, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.urem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 2 x i32> @vpurem_by_const_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_const_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 5
+; CHECK-NEXT: lui a1, 838861
+; CHECK-NEXT: addiw a1, a1, -819
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v11, v8, a1, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmul.vx v10, v10, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 5, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.urem.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpsrem_by_const_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_const_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, %hi(.LCPI60_0)
+; CHECK-NEXT: ld a1, %lo(.LCPI60_0)(a1)
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: vmulh.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vsrl.vx v10, v9, a0, v0.t
+; CHECK-NEXT: vand.vi v10, v10, -1, v0.t
+; CHECK-NEXT: vadd.vv v9, v9, v10, v0.t
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmul.vx v9, v9, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 5, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 4 x i16> @vpsrem_by_const_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_const_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: lui a0, 6
+; CHECK-NEXT: addiw a0, a0, 1639
+; CHECK-NEXT: vmulh.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT: vsrl.vi v10, v9, 15, v0.t
+; CHECK-NEXT: vand.vi v10, v10, -1, v0.t
+; CHECK-NEXT: vadd.vv v9, v9, v10, v0.t
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmul.vx v9, v9, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 5, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.srem.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 8 x i8> @vpsrem_by_const_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_const_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: li a0, 103
+; CHECK-NEXT: vmulh.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT: vsrl.vi v10, v9, 7, v0.t
+; CHECK-NEXT: vand.vi v10, v10, -1, v0.t
+; CHECK-NEXT: vadd.vv v9, v9, v10, v0.t
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmul.vx v9, v9, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 5, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.srem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 2 x i32> @vpsrem_by_const_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_const_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
+; CHECK-NEXT: lui a0, 419430
+; CHECK-NEXT: addiw a0, a0, 1639
+; CHECK-NEXT: vmulh.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT: vsrl.vi v10, v9, 31, v0.t
+; CHECK-NEXT: vand.vi v10, v10, -1, v0.t
+; CHECK-NEXT: vadd.vv v9, v9, v10, v0.t
+; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmul.vx v9, v9, a0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 5, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.srem.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 8 x i8> @vpudiv_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_1_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 1, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpudiv_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_1_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 1, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpudiv_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_1_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 1, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_1_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 1, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpsdiv_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_1_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 1, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpsdiv_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_1_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 1, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sdiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpsdiv_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_1_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 1, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sdiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_1_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 1, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpurem_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_1_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 1, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.urem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpurem_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_1_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 1, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.urem.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpurem_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_1_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 1, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.urem.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpurem_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_1_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 1, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpsrem_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_1_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 1, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.srem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpsrem_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_1_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 1, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.srem.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpsrem_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_1_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 1, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.srem.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpsrem_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_1_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 1, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpsrem_by_neg1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_neg1_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> undef, i8 -1, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.srem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpsrem_by_neg1_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_neg1_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i16> undef, i16 -1, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.srem.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpsrem_by_neg1_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_neg1_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 2 x i32> undef, i32 -1, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.srem.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpsrem_by_neg1_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_neg1_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 -1, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 8 x i8> @vpsdivrem_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdivrem_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 109
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
+; CHECK-NEXT: li a0, -7
+; CHECK-NEXT: vsub.vv v9, v9, v8, v0.t
+; CHECK-NEXT: vsra.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vsrl.vi v10, v9, 7, v0.t
+; CHECK-NEXT: vand.vi v10, v10, -1, v0.t
+; CHECK-NEXT: vadd.vv v9, v9, v10, v0.t
+; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.vp.srem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 -7), <vscale x 8 x i1> %m, i32 %evl)
+ %w = call <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 -7), <vscale x 8 x i1> %m, i32 %evl)
+ %x = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> %v, <vscale x 8 x i8> %w, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %x
+}
+
+define <vscale x 8 x i8> @vpudivrem_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudivrem_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 37
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t
+; CHECK-NEXT: li a0, -128
+; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vmulhu.vx v10, v10, a0, v0.t
+; CHECK-NEXT: li a0, 7
+; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.vp.urem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 7), <vscale x 8 x i1> %m, i32 %evl)
+ %w = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 7), <vscale x 8 x i1> %m, i32 %evl)
+ %x = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> %v, <vscale x 8 x i8> %w, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %x
+}
>From b6137759d0766af3b838e3390d7e9ab1204cc3aa Mon Sep 17 00:00:00 2001
From: Jesse Huang <jesse.huang at sifive.com>
Date: Wed, 10 Jan 2024 04:24:20 -0800
Subject: [PATCH 3/7] [RISCV] Set VP_MULH* to Expand on Zve64* and Optimize
BuildVP*DIV
* Set VP_MULHU/VP_MULHS with i64 vector input to Expand on Zve64*
* Hoisted the isOperationLegalOrCustom check to the early bail-out in BuildVPSDIV/BuildVPUDIV
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 14 +--
.../CodeGen/RISCV/rvv/vpdiv-by-const-zve64.ll | 113 ++++++++++++++++++
2 files changed, 119 insertions(+), 8 deletions(-)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const-zve64.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 82a2500ff386d..e2b74408885c0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6507,7 +6507,8 @@ SDValue TargetLowering::BuildVPSDIV(SDNode *N, SelectionDAG &DAG,
unsigned EltBits = VT.getScalarSizeInBits();
// Check to see if we can do this.
- if (!isTypeLegal(VT))
+ if (!isTypeLegal(VT) ||
+ !isOperationLegalOrCustom(ISD::VP_MULHS, VT, IsAfterLegalization))
return SDValue();
SmallVector<SDValue, 16> MagicFactors, Factors, Shifts, ShiftMasks;
@@ -6577,9 +6578,7 @@ SDValue TargetLowering::BuildVPSDIV(SDNode *N, SelectionDAG &DAG,
// Multiply the numerator (operand 0) by the magic value.
auto GetMULHS = [&](SDValue X, SDValue Y) {
- if (isOperationLegalOrCustom(ISD::VP_MULHS, VT, IsAfterLegalization))
- return DAG.getNode(ISD::VP_MULHS, DL, VT, X, Y, Mask, VL);
- return SDValue();
+ return DAG.getNode(ISD::VP_MULHS, DL, VT, X, Y, Mask, VL);
};
SDValue Q = GetMULHS(N0, MagicFactor);
@@ -6822,7 +6821,8 @@ SDValue TargetLowering::BuildVPUDIV(SDNode *N, SelectionDAG &DAG,
unsigned EltBits = VT.getScalarSizeInBits();
// Check to see if we can do this.
- if (!isTypeLegal(VT))
+ if (!isTypeLegal(VT) ||
+ !isOperationLegalOrCustom(ISD::VP_MULHU, VT, IsAfterLegalization))
return SDValue();
bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
@@ -6908,9 +6908,7 @@ SDValue TargetLowering::BuildVPUDIV(SDNode *N, SelectionDAG &DAG,
}
auto GetMULHU = [&](SDValue X, SDValue Y) {
- if (isOperationLegalOrCustom(ISD::VP_MULHU, VT, IsAfterLegalization))
- return DAG.getNode(ISD::VP_MULHU, DL, VT, X, Y, Mask, VL);
- return SDValue();
+ return DAG.getNode(ISD::VP_MULHU, DL, VT, X, Y, Mask, VL);
};
// Multiply the numerator (operand 0) by the magic value.
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const-zve64.ll b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const-zve64.ll
new file mode 100644
index 0000000000000..2fa4abb642270
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const-zve64.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs | FileCheck %s
+
+declare <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+
+define <vscale x 1 x i64> @vpudiv_by_const_no_add_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_no_add_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vdivu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 5, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_const_with_add_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_with_add_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 7
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vdivu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 7, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_const_no_ashr_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_no_ashr_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 3
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 3, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_const_ashr_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_ashr_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 5, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_const_add_ashr_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_add_ashr_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 15
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 15, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vpsdiv_const_sub_ashr_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -3
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 -3, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vpurem_by_const_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpurem_by_const_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vremu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 5, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vpsrem_by_const_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsrem_by_const_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vrem.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 1 x i64> undef, i64 5, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
>From 81032b1a3891dc5c9612ad5abb7e1fc8bb1f6118 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Mon, 13 Feb 2023 08:39:03 -0800
Subject: [PATCH 4/7] [TargetLowering] Optimize 'factor' code in BuildVPSDIV.
DAGCombiner can't constant fold VP_MUL yet, nor fold away the
(VP_SUB 0, X) and (VP_ADD X, 0) identity patterns.
Add some flags to keep track of when we need to emit VP_MUL/VP_ADD/VP_SUB.
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 25 ++-
llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll | 142 +++++++-----------
2 files changed, 72 insertions(+), 95 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e2b74408885c0..b7846212a94ab 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6511,6 +6511,9 @@ SDValue TargetLowering::BuildVPSDIV(SDNode *N, SelectionDAG &DAG,
!isOperationLegalOrCustom(ISD::VP_MULHS, VT, IsAfterLegalization))
return SDValue();
+ bool AnyFactorOne = false;
+ bool AnyFactorNegOne = false;
+
SmallVector<SDValue, 16> MagicFactors, Factors, Shifts, ShiftMasks;
auto BuildSDIVPattern = [&](ConstantSDNode *C) {
@@ -6529,12 +6532,16 @@ SDValue TargetLowering::BuildVPSDIV(SDNode *N, SelectionDAG &DAG,
magics.Magic = 0;
magics.ShiftAmount = 0;
ShiftMask = 0;
+ AnyFactorOne |= Divisor.isOne();
+ AnyFactorNegOne |= Divisor.isAllOnes();
} else if (Divisor.isStrictlyPositive() && magics.Magic.isNegative()) {
// If d > 0 and m < 0, add the numerator.
NumeratorFactor = 1;
+ AnyFactorOne = true;
} else if (Divisor.isNegative() && magics.Magic.isStrictlyPositive()) {
// If d < 0 and m > 0, subtract the numerator.
NumeratorFactor = -1;
+ AnyFactorNegOne = true;
}
MagicFactors.push_back(DAG.getConstant(magics.Magic, DL, SVT));
@@ -6588,10 +6595,20 @@ SDValue TargetLowering::BuildVPSDIV(SDNode *N, SelectionDAG &DAG,
Created.push_back(Q.getNode());
// (Optionally) Add/subtract the numerator using Factor.
- Factor = DAG.getNode(ISD::VP_MUL, DL, VT, N0, Factor, Mask, VL);
- Created.push_back(Factor.getNode());
- Q = DAG.getNode(ISD::VP_ADD, DL, VT, Q, Factor, Mask, VL);
- Created.push_back(Q.getNode());
+ // FIXME: The AnyFactorOne/NegOne flags are a hack around lack of constant
+ // folding for VP_MUL/ADD.
+ if (AnyFactorOne && AnyFactorNegOne) {
+ Factor = DAG.getNode(ISD::VP_MUL, DL, VT, N0, Factor, Mask, VL);
+ Created.push_back(Factor.getNode());
+ Q = DAG.getNode(ISD::VP_ADD, DL, VT, Q, Factor, Mask, VL);
+ Created.push_back(Q.getNode());
+ } else if (AnyFactorOne) {
+ Q = DAG.getNode(ISD::VP_ADD, DL, VT, Q, N0, Mask, VL);
+ Created.push_back(Q.getNode());
+ } else if (AnyFactorNegOne) {
+ Q = DAG.getNode(ISD::VP_SUB, DL, VT, Q, N0, Mask, VL);
+ Created.push_back(Q.getNode());
+ }
// Shift right algebraic by shift value.
Q = DAG.getNode(ISD::VP_SRA, DL, VT, Q, Shift, Mask, VL);
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll
index f78a0ec7f2378..6e417e4dd7995 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll
@@ -707,11 +707,9 @@ define <vscale x 1 x i64> @vpsdiv_pow2_nxv1i64(<vscale x 1 x i64> %va, <vscale x
define <vscale x 8 x i8> @vpsdiv_const_no_ashr_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_no_ashr_nxv8i8:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 86
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
-; CHECK-NEXT: li a0, 86
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -726,12 +724,10 @@ define <vscale x 8 x i8> @vpsdiv_const_no_ashr_nxv8i8(<vscale x 8 x i8> %va, <vs
define <vscale x 4 x i16> @vpsdiv_const_no_ashr_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_no_ashr_nxv4i16:
; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 5
+; CHECK-NEXT: addiw a1, a1, 1366
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
-; CHECK-NEXT: lui a0, 5
-; CHECK-NEXT: addiw a0, a0, 1366
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -746,12 +742,10 @@ define <vscale x 4 x i16> @vpsdiv_const_no_ashr_nxv4i16(<vscale x 4 x i16> %va,
define <vscale x 2 x i32> @vpsdiv_const_no_ashr_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_no_ashr_nxv2i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 349525
+; CHECK-NEXT: addiw a1, a1, 1366
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
-; CHECK-NEXT: lui a0, 349525
-; CHECK-NEXT: addiw a0, a0, 1366
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -769,9 +763,7 @@ define <vscale x 1 x i64> @vpsdiv_const_no_ashr_nxv1i64(<vscale x 1 x i64> %va,
; CHECK-NEXT: lui a1, %hi(.LCPI39_0)
; CHECK-NEXT: ld a1, %lo(.LCPI39_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t
; CHECK-NEXT: li a0, 63
; CHECK-NEXT: vsrl.vx v9, v8, a0, v0.t
@@ -787,11 +779,9 @@ define <vscale x 1 x i64> @vpsdiv_const_no_ashr_nxv1i64(<vscale x 1 x i64> %va,
define <vscale x 8 x i8> @vpsdiv_const_ashr_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_ashr_nxv8i8:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 103
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
-; CHECK-NEXT: li a0, 103
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -806,12 +796,10 @@ define <vscale x 8 x i8> @vpsdiv_const_ashr_nxv8i8(<vscale x 8 x i8> %va, <vscal
define <vscale x 4 x i16> @vpsdiv_const_ashr_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_ashr_nxv4i16:
; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 6
+; CHECK-NEXT: addiw a1, a1, 1639
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
-; CHECK-NEXT: lui a0, 6
-; CHECK-NEXT: addiw a0, a0, 1639
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -826,12 +814,10 @@ define <vscale x 4 x i16> @vpsdiv_const_ashr_nxv4i16(<vscale x 4 x i16> %va, <vs
define <vscale x 2 x i32> @vpsdiv_const_ashr_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_ashr_nxv2i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 419430
+; CHECK-NEXT: addiw a1, a1, 1639
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
-; CHECK-NEXT: lui a0, 419430
-; CHECK-NEXT: addiw a0, a0, 1639
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -849,9 +835,7 @@ define <vscale x 1 x i64> @vpsdiv_const_ashr_nxv1i64(<vscale x 1 x i64> %va, <vs
; CHECK-NEXT: lui a1, %hi(.LCPI43_0)
; CHECK-NEXT: ld a1, %lo(.LCPI43_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
; CHECK-NEXT: li a0, 63
; CHECK-NEXT: vsrl.vx v9, v8, a0, v0.t
@@ -867,12 +851,10 @@ define <vscale x 1 x i64> @vpsdiv_const_ashr_nxv1i64(<vscale x 1 x i64> %va, <vs
define <vscale x 8 x i8> @vpsdiv_const_add_ashr_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_add_ashr_nxv8i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 1
+; CHECK-NEXT: li a1, -109
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
-; CHECK-NEXT: li a0, -109
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -887,13 +869,11 @@ define <vscale x 8 x i8> @vpsdiv_const_add_ashr_nxv8i8(<vscale x 8 x i8> %va, <v
define <vscale x 4 x i16> @vpsdiv_const_add_ashr_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_add_ashr_nxv4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 1
+; CHECK-NEXT: lui a1, 1048569
+; CHECK-NEXT: addiw a1, a1, -1911
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
-; CHECK-NEXT: lui a0, 1048569
-; CHECK-NEXT: addiw a0, a0, -1911
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 3, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -908,13 +888,11 @@ define <vscale x 4 x i16> @vpsdiv_const_add_ashr_nxv4i16(<vscale x 4 x i16> %va,
define <vscale x 2 x i32> @vpsdiv_const_add_ashr_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_add_ashr_nxv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 1
+; CHECK-NEXT: lui a1, 599186
+; CHECK-NEXT: addiw a1, a1, 1171
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
-; CHECK-NEXT: lui a0, 599186
-; CHECK-NEXT: addiw a0, a0, 1171
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -932,10 +910,8 @@ define <vscale x 1 x i64> @vpsdiv_const_add_ashr_nxv1i64(<vscale x 1 x i64> %va,
; CHECK-NEXT: lui a1, %hi(.LCPI47_0)
; CHECK-NEXT: ld a1, %lo(.LCPI47_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: li a0, 1
-; CHECK-NEXT: vmul.vx v9, v8, a0, v0.t
-; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 3, v0.t
; CHECK-NEXT: li a0, 63
; CHECK-NEXT: vsrl.vx v9, v8, a0, v0.t
@@ -951,12 +927,10 @@ define <vscale x 1 x i64> @vpsdiv_const_add_ashr_nxv1i64(<vscale x 1 x i64> %va,
define <vscale x 8 x i8> @vpsdiv_const_sub_ashr_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv8i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: li a1, 109
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
-; CHECK-NEXT: li a0, 109
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vsub.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -971,13 +945,11 @@ define <vscale x 8 x i8> @vpsdiv_const_sub_ashr_nxv8i8(<vscale x 8 x i8> %va, <v
define <vscale x 4 x i16> @vpsdiv_const_sub_ashr_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: lui a1, 7
+; CHECK-NEXT: addiw a1, a1, 1911
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
-; CHECK-NEXT: lui a0, 7
-; CHECK-NEXT: addiw a0, a0, 1911
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vsub.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 3, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -992,13 +964,11 @@ define <vscale x 4 x i16> @vpsdiv_const_sub_ashr_nxv4i16(<vscale x 4 x i16> %va,
define <vscale x 2 x i32> @vpsdiv_const_sub_ashr_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: lui a1, 449390
+; CHECK-NEXT: addiw a1, a1, -1171
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, a1, v0.t
-; CHECK-NEXT: lui a0, 449390
-; CHECK-NEXT: addiw a0, a0, -1171
-; CHECK-NEXT: vmulh.vx v8, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vsub.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t
; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t
; CHECK-NEXT: vand.vi v9, v9, -1, v0.t
@@ -1016,10 +986,8 @@ define <vscale x 1 x i64> @vpsdiv_const_sub_ashr_nxv1i64(<vscale x 1 x i64> %va,
; CHECK-NEXT: lui a1, %hi(.LCPI51_0)
; CHECK-NEXT: ld a1, %lo(.LCPI51_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vmul.vx v9, v8, a0, v0.t
-; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vsub.vv v8, v9, v8, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
; CHECK-NEXT: li a0, 63
; CHECK-NEXT: vsrl.vx v9, v8, a0, v0.t
@@ -1185,9 +1153,7 @@ define <vscale x 1 x i64> @vpsrem_by_const_nxv1i64(<vscale x 1 x i64> %va, <vsca
; CHECK-NEXT: lui a1, %hi(.LCPI60_0)
; CHECK-NEXT: ld a1, %lo(.LCPI60_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
-; CHECK-NEXT: vmulh.vx v10, v8, a1, v0.t
-; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
; CHECK-NEXT: li a0, 63
; CHECK-NEXT: vsrl.vx v10, v9, a0, v0.t
@@ -1206,12 +1172,10 @@ define <vscale x 1 x i64> @vpsrem_by_const_nxv1i64(<vscale x 1 x i64> %va, <vsca
define <vscale x 4 x i16> @vpsrem_by_const_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsrem_by_const_nxv4i16:
; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 6
+; CHECK-NEXT: addiw a1, a1, 1639
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
-; CHECK-NEXT: lui a0, 6
-; CHECK-NEXT: addiw a0, a0, 1639
-; CHECK-NEXT: vmulh.vx v10, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
; CHECK-NEXT: vsrl.vi v10, v9, 15, v0.t
; CHECK-NEXT: vand.vi v10, v10, -1, v0.t
@@ -1229,11 +1193,9 @@ define <vscale x 4 x i16> @vpsrem_by_const_nxv4i16(<vscale x 4 x i16> %va, <vsca
define <vscale x 8 x i8> @vpsrem_by_const_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsrem_by_const_nxv8i8:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 103
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
-; CHECK-NEXT: li a0, 103
-; CHECK-NEXT: vmulh.vx v10, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
; CHECK-NEXT: vsrl.vi v10, v9, 7, v0.t
; CHECK-NEXT: vand.vi v10, v10, -1, v0.t
@@ -1251,12 +1213,10 @@ define <vscale x 8 x i8> @vpsrem_by_const_nxv8i8(<vscale x 8 x i8> %va, <vscale
define <vscale x 2 x i32> @vpsrem_by_const_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsrem_by_const_nxv2i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 419430
+; CHECK-NEXT: addiw a1, a1, 1639
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmul.vx v9, v8, zero, v0.t
-; CHECK-NEXT: lui a0, 419430
-; CHECK-NEXT: addiw a0, a0, 1639
-; CHECK-NEXT: vmulh.vx v10, v8, a0, v0.t
-; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
; CHECK-NEXT: vsrl.vi v10, v9, 31, v0.t
; CHECK-NEXT: vand.vi v10, v10, -1, v0.t
>From b833489b707156896d7fc28d97c2707aeba59230 Mon Sep 17 00:00:00 2001
From: Jesse Huang <jesse.huang at sifive.com>
Date: Thu, 6 Feb 2025 02:38:26 +0800
Subject: [PATCH 5/7] update test
---
llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll | 613 +++++++++++++-----
1 file changed, 434 insertions(+), 179 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll
index 6e417e4dd7995..b39fc392482cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll
@@ -28,9 +28,7 @@ define <vscale x 8 x i8> @vpudiv_by_max_nxv8i8(<vscale x 8 x i8> %va, <vscale x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 8 x i8> undef, i8 255, i32 0
@@ -44,9 +42,7 @@ define <vscale x 4 x i16> @vpudiv_by_max_nxv4i16(<vscale x 4 x i16> %va, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 4 x i16> undef, i16 65535, i32 0
@@ -60,9 +56,7 @@ define <vscale x 2 x i32> @vpudiv_by_max_nxv2i32(<vscale x 2 x i32> %va, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 2 x i32> undef, i32 4294967295, i32 0
@@ -76,9 +70,7 @@ define <vscale x 1 x i64> @vpudiv_by_max_nxv1i64(<vscale x 1 x i64> %va, <vscale
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 1 x i64> undef, i64 18446744073709551615, i32 0
@@ -92,8 +84,7 @@ define <vscale x 8 x i8> @fold_vpudiv_vpurem_nxv8i8(<vscale x 8 x i8> %va, <vsca
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t
-; CHECK-NEXT: li a0, -128
-; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t
+; CHECK-NEXT: vsll.vi v10, v9, 7, v0.t
; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-NEXT: vadd.vv v8, v9, v8
@@ -111,8 +102,7 @@ define <vscale x 4 x i16> @fold_vpudiv_vpurem_nxv4i16(<vscale x 4 x i16> %va, <v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t
-; CHECK-NEXT: lui a0, 4
-; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t
+; CHECK-NEXT: vsll.vi v10, v9, 14, v0.t
; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vadd.vv v8, v9, v8
@@ -130,8 +120,7 @@ define <vscale x 2 x i32> @fold_vpudiv_vpurem_nxv2i32(<vscale x 2 x i32> %va, <v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t
-; CHECK-NEXT: lui a0, 4
-; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t
+; CHECK-NEXT: vsll.vi v10, v9, 14, v0.t
; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vadd.vv v8, v9, v8
@@ -149,8 +138,7 @@ define <vscale x 1 x i64> @fold_vpudiv_vpurem_nxv1i64(<vscale x 1 x i64> %va, <v
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t
-; CHECK-NEXT: lui a0, 4
-; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t
+; CHECK-NEXT: vsll.vi v10, v9, 14, v0.t
; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vadd.vv v8, v9, v8
@@ -197,8 +185,6 @@ define <vscale x 4 x i16> @vpudiv_by_shl2_nxv4i16(<vscale x 4 x i16> %va, i16 %b
define <vscale x 2 x i32> @vpudiv_by_shl2_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_shl2_nxv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: slli a0, a0, 32
-; CHECK-NEXT: srli a0, a0, 32
; CHECK-NEXT: addi a0, a0, 1
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t
@@ -227,9 +213,8 @@ define <vscale x 1 x i64> @vpudiv_by_shl2_nxv1i64(<vscale x 1 x i64> %va, i64 %b
define <vscale x 8 x i8> @vpudiv_by_vpshl2_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_vpshl2_nxv8i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
@@ -245,9 +230,8 @@ define <vscale x 8 x i8> @vpudiv_by_vpshl2_nxv8i8(<vscale x 8 x i8> %va, i8 %b,
define <vscale x 4 x i16> @vpudiv_by_vpshl2_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_vpshl2_nxv4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
@@ -263,9 +247,8 @@ define <vscale x 4 x i16> @vpudiv_by_vpshl2_nxv4i16(<vscale x 4 x i16> %va, i16
define <vscale x 2 x i32> @vpudiv_by_vpshl2_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_vpshl2_nxv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
@@ -281,9 +264,8 @@ define <vscale x 2 x i32> @vpudiv_by_vpshl2_nxv2i32(<vscale x 2 x i32> %va, i32
define <vscale x 1 x i64> @vpudiv_by_vpshl2_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_vpshl2_nxv1i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
@@ -299,11 +281,10 @@ define <vscale x 1 x i64> @vpudiv_by_vpshl2_nxv1i64(<vscale x 1 x i64> %va, i64
define <vscale x 8 x i8> @vpudiv_by_const_no_add_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_const_no_add_nxv8i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 5
-; CHECK-NEXT: li a1, -51
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: li a0, -51
+; CHECK-NEXT: vmulhu.vx v10, v8, a0, v0.t
; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
@@ -317,12 +298,11 @@ define <vscale x 8 x i8> @vpudiv_by_const_no_add_nxv8i8(<vscale x 8 x i8> %va, <
define <vscale x 4 x i16> @vpudiv_by_const_no_add_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_const_no_add_nxv4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 5
-; CHECK-NEXT: lui a1, 1048573
-; CHECK-NEXT: addiw a1, a1, -819
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: lui a0, 1048573
+; CHECK-NEXT: addi a0, a0, -819
+; CHECK-NEXT: vmulhu.vx v10, v8, a0, v0.t
; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
@@ -336,12 +316,11 @@ define <vscale x 4 x i16> @vpudiv_by_const_no_add_nxv4i16(<vscale x 4 x i16> %va
define <vscale x 2 x i32> @vpudiv_by_const_no_add_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_const_no_add_nxv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 5
-; CHECK-NEXT: lui a1, 838861
-; CHECK-NEXT: addiw a1, a1, -819
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: lui a0, 838861
+; CHECK-NEXT: addi a0, a0, -819
+; CHECK-NEXT: vmulhu.vx v10, v8, a0, v0.t
; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
@@ -355,14 +334,16 @@ define <vscale x 2 x i32> @vpudiv_by_const_no_add_nxv2i32(<vscale x 2 x i32> %va
define <vscale x 1 x i64> @vpudiv_by_const_no_add_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_const_no_add_nxv1i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI19_0)
-; CHECK-NEXT: ld a1, %lo(.LCPI19_0)(a1)
-; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 5
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: lui a0, 838861
+; CHECK-NEXT: vmseq.vi v9, v9, 1, v0.t
+; CHECK-NEXT: addiw a0, a0, -819
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vmulhu.vx v10, v8, a0, v0.t
; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
-; CHECK-NEXT: vmseq.vi v0, v9, 1, v0.t
+; CHECK-NEXT: vmv.v.v v0, v9
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 1 x i64> undef, i64 5, i32 0
@@ -374,13 +355,12 @@ define <vscale x 1 x i64> @vpudiv_by_const_no_add_nxv1i64(<vscale x 1 x i64> %va
define <vscale x 8 x i8> @vpudiv_by_const_with_add_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_const_with_add_nxv8i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 7
-; CHECK-NEXT: li a1, 37
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
-; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
+; CHECK-NEXT: vmv.v.i v9, 7
+; CHECK-NEXT: li a0, 37
+; CHECK-NEXT: vmulhu.vx v10, v8, a0, v0.t
; CHECK-NEXT: li a0, -128
+; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
; CHECK-NEXT: vmulhu.vx v11, v11, a0, v0.t
; CHECK-NEXT: vadd.vv v10, v11, v10, v0.t
; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
@@ -396,14 +376,13 @@ define <vscale x 8 x i8> @vpudiv_by_const_with_add_nxv8i8(<vscale x 8 x i8> %va,
define <vscale x 4 x i16> @vpudiv_by_const_with_add_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_const_with_add_nxv4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 7
-; CHECK-NEXT: lui a1, 2
-; CHECK-NEXT: addiw a1, a1, 1171
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
-; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
+; CHECK-NEXT: vmv.v.i v9, 7
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: addi a0, a0, 1171
+; CHECK-NEXT: vmulhu.vx v10, v8, a0, v0.t
; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
; CHECK-NEXT: vmulhu.vx v11, v11, a0, v0.t
; CHECK-NEXT: vadd.vv v10, v11, v10, v0.t
; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
@@ -419,14 +398,13 @@ define <vscale x 4 x i16> @vpudiv_by_const_with_add_nxv4i16(<vscale x 4 x i16> %
define <vscale x 2 x i32> @vpudiv_by_const_with_add_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_const_with_add_nxv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 7
-; CHECK-NEXT: lui a1, 149797
-; CHECK-NEXT: addiw a1, a1, -1755
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
-; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
+; CHECK-NEXT: vmv.v.i v9, 7
+; CHECK-NEXT: lui a0, 149797
+; CHECK-NEXT: addi a0, a0, -1755
+; CHECK-NEXT: vmulhu.vx v10, v8, a0, v0.t
; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
; CHECK-NEXT: vmulhu.vx v11, v11, a0, v0.t
; CHECK-NEXT: vadd.vv v10, v11, v10, v0.t
; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
@@ -444,13 +422,12 @@ define <vscale x 1 x i64> @vpudiv_by_const_with_add_nxv1i64(<vscale x 1 x i64> %
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, %hi(.LCPI23_0)
; CHECK-NEXT: ld a1, %lo(.LCPI23_0)(a1)
-; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 7
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 7
; CHECK-NEXT: vmulhu.vx v10, v8, a1, v0.t
-; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: slli a0, a0, 63
+; CHECK-NEXT: vsub.vv v11, v8, v10, v0.t
; CHECK-NEXT: vmulhu.vx v11, v11, a0, v0.t
; CHECK-NEXT: vadd.vv v10, v11, v10, v0.t
; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
@@ -517,9 +494,7 @@ define <vscale x 8 x i8> @vpsdiv_by_min_nxv8i8(<vscale x 8 x i8> %va, <vscale x
; CHECK-NEXT: li a1, -128
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 8 x i8> undef, i8 -128, i32 0
@@ -535,9 +510,7 @@ define <vscale x 1 x i64> @vpsdiv_by_min_nxv1i64(<vscale x 1 x i64> %va, <vscale
; CHECK-NEXT: slli a1, a1, 63
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 1 x i64> undef, i64 -9223372036854775808, i32 0
@@ -552,9 +525,7 @@ define <vscale x 4 x i16> @vpsdiv_by_min_nxv4i16(<vscale x 4 x i16> %va, <vscale
; CHECK-NEXT: lui a1, 1048568
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 4 x i16> undef, i16 -32768, i32 0
@@ -569,9 +540,7 @@ define <vscale x 2 x i32> @vpsdiv_by_min_nxv2i32(<vscale x 2 x i32> %va, <vscale
; CHECK-NEXT: lui a1, 524288
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 2 x i32> undef, i32 -2147483648, i32 0
@@ -583,10 +552,9 @@ define <vscale x 2 x i32> @vpsdiv_by_min_nxv2i32(<vscale x 2 x i32> %va, <vscale
define <vscale x 4 x i16> @vpsdiv_pow2_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_pow2_nxv4i16:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vmv.v.i v10, 4
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
; CHECK-NEXT: vmor.mm v10, v10, v11
@@ -594,15 +562,12 @@ define <vscale x 4 x i16> @vpsdiv_pow2_nxv4i16(<vscale x 4 x i16> %va, <vscale x
; CHECK-NEXT: vsrl.vi v11, v11, 14, v0.t
; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmv.v.i v12, 0
; CHECK-NEXT: vmv.v.v v0, v10
; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
; CHECK-NEXT: vmv1r.v v0, v9
; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v11, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vmsgt.vi v0, v11, 4, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 4, v0.t
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 4 x i16> undef, i16 4, i32 0
@@ -614,10 +579,9 @@ define <vscale x 4 x i16> @vpsdiv_pow2_nxv4i16(<vscale x 4 x i16> %va, <vscale x
define <vscale x 8 x i8> @vpsdiv_pow2_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_pow2_nxv8i8:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmv.v.i v10, 4
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
; CHECK-NEXT: vmor.mm v10, v10, v11
@@ -625,15 +589,12 @@ define <vscale x 8 x i8> @vpsdiv_pow2_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8
; CHECK-NEXT: vsrl.vi v11, v11, 6, v0.t
; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmv.v.i v12, 0
; CHECK-NEXT: vmv.v.v v0, v10
; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
; CHECK-NEXT: vmv1r.v v0, v9
; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v11, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vmsgt.vi v0, v11, 4, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 4, v0.t
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 8 x i8> undef, i8 4, i32 0
@@ -645,10 +606,9 @@ define <vscale x 8 x i8> @vpsdiv_pow2_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8
define <vscale x 2 x i32> @vpsdiv_pow2_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_pow2_nxv2i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v10, 4
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
; CHECK-NEXT: vmor.mm v10, v10, v11
@@ -656,15 +616,12 @@ define <vscale x 2 x i32> @vpsdiv_pow2_nxv2i32(<vscale x 2 x i32> %va, <vscale x
; CHECK-NEXT: vsrl.vi v11, v11, 30, v0.t
; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmv.v.i v12, 0
; CHECK-NEXT: vmv.v.v v0, v10
; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
; CHECK-NEXT: vmv1r.v v0, v9
; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v11, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vmsgt.vi v0, v11, 4, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 4, v0.t
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 2 x i32> undef, i32 4, i32 0
@@ -676,26 +633,24 @@ define <vscale x 2 x i32> @vpsdiv_pow2_nxv2i32(<vscale x 2 x i32> %va, <vscale x
define <vscale x 1 x i64> @vpsdiv_pow2_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_pow2_nxv1i64:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-NEXT: vmv.v.i v10, 4
-; CHECK-NEXT: li a1, 63
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vsra.vx v11, v8, a1, v0.t
-; CHECK-NEXT: li a1, 62
-; CHECK-NEXT: vsrl.vx v11, v11, a1, v0.t
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vsra.vx v12, v8, a0, v0.t
+; CHECK-NEXT: li a0, 62
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsrl.vx v11, v12, a0, v0.t
; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t
-; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t
-; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
-; CHECK-NEXT: vmor.mm v0, v10, v12
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv.v.v v0, v10
; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
; CHECK-NEXT: vmv1r.v v0, v9
; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
-; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v11, 0
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmsgt.vi v0, v11, 4, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 4, v0.t
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 1 x i64> undef, i64 4, i32 0
@@ -725,7 +680,7 @@ define <vscale x 4 x i16> @vpsdiv_const_no_ashr_nxv4i16(<vscale x 4 x i16> %va,
; CHECK-LABEL: vpsdiv_const_no_ashr_nxv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, 5
-; CHECK-NEXT: addiw a1, a1, 1366
+; CHECK-NEXT: addi a1, a1, 1366
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t
@@ -743,7 +698,7 @@ define <vscale x 2 x i32> @vpsdiv_const_no_ashr_nxv2i32(<vscale x 2 x i32> %va,
; CHECK-LABEL: vpsdiv_const_no_ashr_nxv2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, 349525
-; CHECK-NEXT: addiw a1, a1, 1366
+; CHECK-NEXT: addi a1, a1, 1366
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t
@@ -797,7 +752,7 @@ define <vscale x 4 x i16> @vpsdiv_const_ashr_nxv4i16(<vscale x 4 x i16> %va, <vs
; CHECK-LABEL: vpsdiv_const_ashr_nxv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, 6
-; CHECK-NEXT: addiw a1, a1, 1639
+; CHECK-NEXT: addi a1, a1, 1639
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
@@ -815,7 +770,7 @@ define <vscale x 2 x i32> @vpsdiv_const_ashr_nxv2i32(<vscale x 2 x i32> %va, <vs
; CHECK-LABEL: vpsdiv_const_ashr_nxv2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, 419430
-; CHECK-NEXT: addiw a1, a1, 1639
+; CHECK-NEXT: addi a1, a1, 1639
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t
@@ -870,7 +825,7 @@ define <vscale x 4 x i16> @vpsdiv_const_add_ashr_nxv4i16(<vscale x 4 x i16> %va,
; CHECK-LABEL: vpsdiv_const_add_ashr_nxv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, 1048569
-; CHECK-NEXT: addiw a1, a1, -1911
+; CHECK-NEXT: addi a1, a1, -1911
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t
@@ -889,7 +844,7 @@ define <vscale x 2 x i32> @vpsdiv_const_add_ashr_nxv2i32(<vscale x 2 x i32> %va,
; CHECK-LABEL: vpsdiv_const_add_ashr_nxv2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, 599186
-; CHECK-NEXT: addiw a1, a1, 1171
+; CHECK-NEXT: addi a1, a1, 1171
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t
@@ -907,8 +862,10 @@ define <vscale x 2 x i32> @vpsdiv_const_add_ashr_nxv2i32(<vscale x 2 x i32> %va,
define <vscale x 1 x i64> @vpsdiv_const_add_ashr_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_add_ashr_nxv1i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI47_0)
-; CHECK-NEXT: ld a1, %lo(.LCPI47_0)(a1)
+; CHECK-NEXT: lui a1, 559241
+; CHECK-NEXT: addiw a1, a1, -1911
+; CHECK-NEXT: slli a2, a1, 32
+; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t
@@ -946,7 +903,7 @@ define <vscale x 4 x i16> @vpsdiv_const_sub_ashr_nxv4i16(<vscale x 4 x i16> %va,
; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, 7
-; CHECK-NEXT: addiw a1, a1, 1911
+; CHECK-NEXT: addi a1, a1, 1911
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vsub.vv v8, v9, v8, v0.t
@@ -965,7 +922,7 @@ define <vscale x 2 x i32> @vpsdiv_const_sub_ashr_nxv2i32(<vscale x 2 x i32> %va,
; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, 449390
-; CHECK-NEXT: addiw a1, a1, -1171
+; CHECK-NEXT: addi a1, a1, -1171
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vsub.vv v8, v9, v8, v0.t
@@ -983,8 +940,10 @@ define <vscale x 2 x i32> @vpsdiv_const_sub_ashr_nxv2i32(<vscale x 2 x i32> %va,
define <vscale x 1 x i64> @vpsdiv_const_sub_ashr_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv1i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI51_0)
-; CHECK-NEXT: ld a1, %lo(.LCPI51_0)(a1)
+; CHECK-NEXT: lui a1, 349525
+; CHECK-NEXT: addiw a1, a1, 1365
+; CHECK-NEXT: slli a2, a1, 32
+; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vsub.vv v8, v9, v8, v0.t
@@ -1055,15 +1014,17 @@ define <vscale x 2 x i32> @vpurem_by_max_nxv2i32(<vscale x 2 x i32> %va, <vscale
define <vscale x 1 x i64> @vpurem_by_const_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpurem_by_const_nxv1i64:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: lui a1, %hi(.LCPI56_0)
-; CHECK-NEXT: ld a1, %lo(.LCPI56_0)(a1)
-; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
; CHECK-NEXT: vmv.v.i v10, 5
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v11, v8, a1, v0.t
+; CHECK-NEXT: lui a0, 838861
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: addiw a0, a0, -819
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vmulhu.vx v11, v8, a0, v0.t
; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t
-; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
; CHECK-NEXT: li a0, 5
; CHECK-NEXT: vmv1r.v v0, v9
@@ -1079,17 +1040,16 @@ define <vscale x 1 x i64> @vpurem_by_const_nxv1i64(<vscale x 1 x i64> %va, <vsca
define <vscale x 4 x i16> @vpurem_by_const_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpurem_by_const_nxv4i16:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vmv.v.i v10, 5
-; CHECK-NEXT: lui a1, 1048573
-; CHECK-NEXT: addiw a1, a1, -819
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v11, v8, a1, v0.t
+; CHECK-NEXT: lui a0, 1048573
+; CHECK-NEXT: addi a0, a0, -819
+; CHECK-NEXT: vmulhu.vx v11, v8, a0, v0.t
; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t
; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
-; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
; CHECK-NEXT: vmv1r.v v0, v9
; CHECK-NEXT: vmul.vx v10, v10, a0, v0.t
; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
@@ -1103,16 +1063,15 @@ define <vscale x 4 x i16> @vpurem_by_const_nxv4i16(<vscale x 4 x i16> %va, <vsca
define <vscale x 8 x i8> @vpurem_by_const_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpurem_by_const_nxv8i8:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmv.v.i v10, 5
-; CHECK-NEXT: li a1, -51
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v11, v8, a1, v0.t
+; CHECK-NEXT: li a0, -51
+; CHECK-NEXT: vmulhu.vx v11, v8, a0, v0.t
; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t
; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
-; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
; CHECK-NEXT: vmv1r.v v0, v9
; CHECK-NEXT: vmul.vx v10, v10, a0, v0.t
; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
@@ -1126,17 +1085,16 @@ define <vscale x 8 x i8> @vpurem_by_const_nxv8i8(<vscale x 8 x i8> %va, <vscale
define <vscale x 2 x i32> @vpurem_by_const_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpurem_by_const_nxv2i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v10, 5
-; CHECK-NEXT: lui a1, 838861
-; CHECK-NEXT: addiw a1, a1, -819
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v11, v8, a1, v0.t
+; CHECK-NEXT: lui a0, 838861
+; CHECK-NEXT: addi a0, a0, -819
+; CHECK-NEXT: vmulhu.vx v11, v8, a0, v0.t
; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t
; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
-; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
; CHECK-NEXT: vmv1r.v v0, v9
; CHECK-NEXT: vmul.vx v10, v10, a0, v0.t
; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
@@ -1154,8 +1112,8 @@ define <vscale x 1 x i64> @vpsrem_by_const_nxv1i64(<vscale x 1 x i64> %va, <vsca
; CHECK-NEXT: ld a1, %lo(.LCPI60_0)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
-; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
; CHECK-NEXT: vsrl.vx v10, v9, a0, v0.t
; CHECK-NEXT: vand.vi v10, v10, -1, v0.t
; CHECK-NEXT: vadd.vv v9, v9, v10, v0.t
@@ -1173,7 +1131,7 @@ define <vscale x 4 x i16> @vpsrem_by_const_nxv4i16(<vscale x 4 x i16> %va, <vsca
; CHECK-LABEL: vpsrem_by_const_nxv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, 6
-; CHECK-NEXT: addiw a1, a1, 1639
+; CHECK-NEXT: addi a1, a1, 1639
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
@@ -1214,7 +1172,7 @@ define <vscale x 2 x i32> @vpsrem_by_const_nxv2i32(<vscale x 2 x i32> %va, <vsca
; CHECK-LABEL: vpsrem_by_const_nxv2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a1, 419430
-; CHECK-NEXT: addiw a1, a1, 1639
+; CHECK-NEXT: addi a1, a1, 1639
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t
; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t
@@ -1234,6 +1192,23 @@ define <vscale x 2 x i32> @vpsrem_by_const_nxv2i32(<vscale x 2 x i32> %va, <vsca
define <vscale x 8 x i8> @vpudiv_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_1_nxv8i8:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsra.vi v11, v8, 7, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 8, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 8 x i8> undef, i8 1, i32 0
%splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -1244,6 +1219,23 @@ define <vscale x 8 x i8> @vpudiv_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8
define <vscale x 4 x i16> @vpudiv_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_1_nxv4i16:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsra.vi v11, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 16, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 4 x i16> undef, i16 1, i32 0
%splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -1254,6 +1246,24 @@ define <vscale x 4 x i16> @vpudiv_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x
define <vscale x 2 x i32> @vpudiv_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_1_nxv2i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsra.vi v11, v8, 31, v0.t
+; CHECK-NEXT: vsrl.vx v11, v11, a0, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 2 x i32> undef, i32 1, i32 0
%splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -1264,6 +1274,25 @@ define <vscale x 2 x i32> @vpudiv_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x
define <vscale x 1 x i64> @vpudiv_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudiv_by_1_nxv1i64:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vsra.vx v12, v8, a0, v0.t
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsrl.vx v11, v12, a0, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 1 x i64> undef, i64 1, i32 0
%splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
@@ -1274,6 +1303,23 @@ define <vscale x 1 x i64> @vpudiv_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x
define <vscale x 8 x i8> @vpsdiv_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_by_1_nxv8i8:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsra.vi v11, v8, 7, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 8, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 8 x i8> undef, i8 1, i32 0
%splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -1284,6 +1330,23 @@ define <vscale x 8 x i8> @vpsdiv_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8
define <vscale x 4 x i16> @vpsdiv_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_by_1_nxv4i16:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsra.vi v11, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 16, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 4 x i16> undef, i16 1, i32 0
%splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -1294,6 +1357,24 @@ define <vscale x 4 x i16> @vpsdiv_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x
define <vscale x 2 x i32> @vpsdiv_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_by_1_nxv2i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsra.vi v11, v8, 31, v0.t
+; CHECK-NEXT: vsrl.vx v11, v11, a0, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 2 x i32> undef, i32 1, i32 0
%splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -1304,6 +1385,25 @@ define <vscale x 2 x i32> @vpsdiv_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x
define <vscale x 1 x i64> @vpsdiv_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsdiv_by_1_nxv1i64:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vsra.vx v12, v8, a0, v0.t
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsrl.vx v11, v12, a0, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
%vec = insertelement <vscale x 1 x i64> undef, i64 1, i32 0
%splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
@@ -1314,8 +1414,9 @@ define <vscale x 1 x i64> @vpsdiv_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x
define <vscale x 8 x i8> @vpurem_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpurem_by_1_nxv8i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 8 x i8> undef, i8 1, i32 0
%splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -1326,8 +1427,9 @@ define <vscale x 8 x i8> @vpurem_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8
define <vscale x 4 x i16> @vpurem_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpurem_by_1_nxv4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 4 x i16> undef, i16 1, i32 0
%splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -1338,8 +1440,9 @@ define <vscale x 4 x i16> @vpurem_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x
define <vscale x 2 x i32> @vpurem_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpurem_by_1_nxv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 2 x i32> undef, i32 1, i32 0
%splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -1350,8 +1453,9 @@ define <vscale x 2 x i32> @vpurem_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x
define <vscale x 1 x i64> @vpurem_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpurem_by_1_nxv1i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 1 x i64> undef, i64 1, i32 0
%splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
@@ -1362,8 +1466,25 @@ define <vscale x 1 x i64> @vpurem_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x
define <vscale x 8 x i8> @vpsrem_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsrem_by_1_nxv8i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v12
+; CHECK-NEXT: vsra.vi v12, v8, 7, v0.t
+; CHECK-NEXT: vsrl.vi v12, v12, 8, v0.t
+; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v11, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 8 x i8> undef, i8 1, i32 0
%splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -1374,8 +1495,25 @@ define <vscale x 8 x i8> @vpsrem_by_1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8
define <vscale x 4 x i16> @vpsrem_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsrem_by_1_nxv4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v12
+; CHECK-NEXT: vsra.vi v12, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v12, v12, 16, v0.t
+; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v11, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 4 x i16> undef, i16 1, i32 0
%splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -1386,8 +1524,26 @@ define <vscale x 4 x i16> @vpsrem_by_1_nxv4i16(<vscale x 4 x i16> %va, <vscale x
define <vscale x 2 x i32> @vpsrem_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsrem_by_1_nxv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v12
+; CHECK-NEXT: vsra.vi v12, v8, 31, v0.t
+; CHECK-NEXT: vsrl.vx v12, v12, a0, v0.t
+; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v11, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 2 x i32> undef, i32 1, i32 0
%splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -1398,8 +1554,27 @@ define <vscale x 2 x i32> @vpsrem_by_1_nxv2i32(<vscale x 2 x i32> %va, <vscale x
define <vscale x 1 x i64> @vpsrem_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsrem_by_1_nxv1i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 1
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vsra.vx v12, v8, a0, v0.t
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmv.v.i v13, 0
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsrl.vx v11, v12, a0, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v11, v10, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v13, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 1 x i64> undef, i64 1, i32 0
%splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
@@ -1410,8 +1585,26 @@ define <vscale x 1 x i64> @vpsrem_by_1_nxv1i64(<vscale x 1 x i64> %va, <vscale x
define <vscale x 8 x i8> @vpsrem_by_neg1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsrem_by_neg1_nxv8i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, -1
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v12
+; CHECK-NEXT: vsra.vi v12, v8, 7, v0.t
+; CHECK-NEXT: vsrl.vi v12, v12, 8, v0.t
+; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v11, -1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v10, 0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 8 x i8> undef, i8 -1, i32 0
%splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
@@ -1422,8 +1615,26 @@ define <vscale x 8 x i8> @vpsrem_by_neg1_nxv8i8(<vscale x 8 x i8> %va, <vscale x
define <vscale x 4 x i16> @vpsrem_by_neg1_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsrem_by_neg1_nxv4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, -1
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v12
+; CHECK-NEXT: vsra.vi v12, v8, 15, v0.t
+; CHECK-NEXT: vsrl.vi v12, v12, 16, v0.t
+; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v11, -1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v10, 0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 4 x i16> undef, i16 -1, i32 0
%splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
@@ -1434,8 +1645,27 @@ define <vscale x 4 x i16> @vpsrem_by_neg1_nxv4i16(<vscale x 4 x i16> %va, <vscal
define <vscale x 2 x i32> @vpsrem_by_neg1_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsrem_by_neg1_nxv2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, -1
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vmor.mm v10, v10, v12
+; CHECK-NEXT: vsra.vi v12, v8, 31, v0.t
+; CHECK-NEXT: vsrl.vx v12, v12, a0, v0.t
+; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t
+; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v11, -1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v10, 0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 2 x i32> undef, i32 -1, i32 0
%splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
@@ -1446,8 +1676,28 @@ define <vscale x 2 x i32> @vpsrem_by_neg1_nxv2i32(<vscale x 2 x i32> %va, <vscal
define <vscale x 1 x i64> @vpsrem_by_neg1_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpsrem_by_neg1_nxv1i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, -1
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t
+; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t
+; CHECK-NEXT: vsra.vx v12, v8, a0, v0.t
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmv.v.i v13, 0
+; CHECK-NEXT: vmor.mm v10, v10, v11
+; CHECK-NEXT: vsrl.vx v11, v12, a0, v0.t
+; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t
+; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v11, v10, 0, v0.t
+; CHECK-NEXT: vmsgt.vi v0, v13, -1, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrsub.vi v10, v10, 0, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
; CHECK-NEXT: ret
%vec = insertelement <vscale x 1 x i64> undef, i64 -1, i32 0
%splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
@@ -1480,18 +1730,23 @@ define <vscale x 8 x i8> @vpsdivrem_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x
define <vscale x 8 x i8> @vpudivrem_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpudivrem_nxv8i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 37
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 7
+; CHECK-NEXT: li a0, 37
+; CHECK-NEXT: vmulhu.vx v11, v8, a0, v0.t
; CHECK-NEXT: li a0, -128
-; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t
-; CHECK-NEXT: vmulhu.vx v10, v10, a0, v0.t
+; CHECK-NEXT: vsub.vv v12, v8, v11, v0.t
+; CHECK-NEXT: vmulhu.vx v12, v12, a0, v0.t
+; CHECK-NEXT: vadd.vv v11, v12, v11, v0.t
+; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
; CHECK-NEXT: li a0, 7
-; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
-; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t
-; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmul.vx v11, v10, a0, v0.t
+; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v11, v0.t
; CHECK-NEXT: ret
%v = call <vscale x 8 x i8> @llvm.vp.urem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 7), <vscale x 8 x i1> %m, i32 %evl)
%w = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 7), <vscale x 8 x i1> %m, i32 %evl)
>From 53b16ef51c0379812deeeead518ece3008143e29 Mon Sep 17 00:00:00 2001
From: Jesse Huang <jesse.huang at sifive.com>
Date: Wed, 19 Feb 2025 03:22:15 -0800
Subject: [PATCH 6/7] address comments
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 74ab35f8c5f05..5cd17a203dbf2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27375,7 +27375,7 @@ SDValue DAGCombiner::visitVPUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
if (N1.getOpcode() == ISD::VP_SHL && N1->getOperand(2) == Mask &&
N1->getOperand(3) == VL) {
SDValue N10 = N1.getOperand(0);
- if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) &&
+ if (isConstantOrConstantVector(N10, /*NoOpaques=*/true) &&
DAG.isKnownToBeAPowerOfTwo(N10)) {
SDValue LogBase2 = BuildLogBase2(N10, DL);
AddToWorklist(LogBase2.getNode());
@@ -27416,9 +27416,10 @@ SDValue DAGCombiner::visitVPUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
// fold (udiv x, c) -> alternate
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isConstantOrConstantVector(N1) &&
- !TLI.isIntDivCheap(N->getValueType(0), Attr))
+ !TLI.isIntDivCheap(N->getValueType(0), Attr)) {
if (SDValue Op = BuildVPUDIV(N))
return Op;
+ }
return SDValue();
}
>From 127432cb1683513fa8cb6495520ad10d233e7e72 Mon Sep 17 00:00:00 2001
From: Jesse Huang <jesse.huang at sifive.com>
Date: Wed, 19 Feb 2025 03:25:37 -0800
Subject: [PATCH 7/7] Merge VP_MULHU/HS expand and custom part
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 72b6ba0c2d8ce..dc8603a5376cf 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1283,8 +1283,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Custom);
// vXi64 MULHS/MULHU requires the V extension instead of Zve64*.
- if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV())
+ if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV()) {
setOperationAction({ISD::MULHS, ISD::MULHU}, VT, Custom);
+ } else {
+ setOperationAction({ISD::VP_MULHU, ISD::VP_MULHS}, VT, Expand);
+ }
setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU, ISD::AVGCEILS,
ISD::AVGCEILU, ISD::SADDSAT, ISD::UADDSAT,
@@ -1305,11 +1308,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(IntegerVPOps, VT, Custom);
- // Zve64* does not support VP_MULHU/S with nxvXi64.
- if (VT.getVectorElementType() == MVT::i64 && !Subtarget.hasStdExtV()) {
- setOperationAction({ISD::VP_MULHU, ISD::VP_MULHS}, VT, Expand);
- }
-
if (Subtarget.hasStdExtZvkb())
setOperationAction({ISD::BSWAP, ISD::ROTL, ISD::ROTR}, VT, Custom);
More information about the llvm-commits
mailing list