[llvm] [RISCV][ISel] Fix bug on invalid extension combine in #72340 (PR #76785)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 4 01:07:49 PST 2024
https://github.com/sun-jacobi updated https://github.com/llvm/llvm-project/pull/76785
>From 60f7b1576ed58a1f909ea36d9bbf66200c7b8ef9 Mon Sep 17 00:00:00 2001
From: sun-jacobi <sun1011jacobi at gmail.com>
Date: Wed, 3 Jan 2024 16:03:33 +0900
Subject: [PATCH 1/2] Recreate "[RISCV][ISel] Combine scalable vector
add/sub/mul with zero/sign extension. (#72340)"
This recreates #72340, which was reverted by 4e347b4e38b95bc455d0e620e11ac58fc0172a94.
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 233 +++++++---
llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll | 128 ++---
.../RISCV/rvv/vscale-vw-web-simplification.ll | 436 ++++++++++++++++--
3 files changed, 642 insertions(+), 155 deletions(-)
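For context (not part of the patch): with this change the combine also fires on plain ISD::ADD/SUB/MUL over legal scalable vector types whose operands are sign/zero extensions from elements exactly half as wide. A minimal illustrative IR function of that shape -- hypothetical name, assuming a riscv target with +v -- would be:

define <vscale x 2 x i32> @vwadd_sketch(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b) {
  ; both operands are sign-extended from half-width elements,
  ; so the add can be selected as a widening vwadd.vv
  %x = sext <vscale x 2 x i16> %a to <vscale x 2 x i32>
  %y = sext <vscale x 2 x i16> %b to <vscale x 2 x i32>
  %sum = add <vscale x 2 x i32> %x, %y
  ret <vscale x 2 x i32> %sum
}

The tests added below exercise the same pattern (and its zext and mixed-width variants) through loads whose extended values have multiple users.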
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 27bb69dc9868c8..2fb79c81b7f169 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1374,8 +1374,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setPrefLoopAlignment(Subtarget.getPrefLoopAlignment());
setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
- ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND,
- ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT});
+ ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::MUL,
+ ISD::AND, ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT});
if (Subtarget.is64Bit())
setTargetDAGCombine(ISD::SRA);
@@ -12850,9 +12850,9 @@ struct CombineResult;
/// Helper class for folding sign/zero extensions.
/// In particular, this class is used for the following combines:
-/// add_vl -> vwadd(u) | vwadd(u)_w
-/// sub_vl -> vwsub(u) | vwsub(u)_w
-/// mul_vl -> vwmul(u) | vwmul_su
+/// add | add_vl -> vwadd(u) | vwadd(u)_w
+/// sub | sub_vl -> vwsub(u) | vwsub(u)_w
+/// mul | mul_vl -> vwmul(u) | vwmul_su
///
/// An object of this class represents an operand of the operation we want to
/// combine.
@@ -12897,6 +12897,8 @@ struct NodeExtensionHelper {
/// E.g., for zext(a), this would return a.
SDValue getSource() const {
switch (OrigOperand.getOpcode()) {
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
case RISCVISD::VSEXT_VL:
case RISCVISD::VZEXT_VL:
return OrigOperand.getOperand(0);
@@ -12913,7 +12915,8 @@ struct NodeExtensionHelper {
/// Get or create a value that can feed \p Root with the given extension \p
/// SExt. If \p SExt is std::nullopt, this returns the source of this operand.
/// \see ::getSource().
- SDValue getOrCreateExtendedOp(const SDNode *Root, SelectionDAG &DAG,
+ SDValue getOrCreateExtendedOp(SDNode *Root, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget,
std::optional<bool> SExt) const {
if (!SExt.has_value())
return OrigOperand;
@@ -12928,8 +12931,10 @@ struct NodeExtensionHelper {
// If we need an extension, we should be changing the type.
SDLoc DL(Root);
- auto [Mask, VL] = getMaskAndVL(Root);
+ auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget);
switch (OrigOperand.getOpcode()) {
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
case RISCVISD::VSEXT_VL:
case RISCVISD::VZEXT_VL:
return DAG.getNode(ExtOpc, DL, NarrowVT, Source, Mask, VL);
@@ -12969,12 +12974,15 @@ struct NodeExtensionHelper {
/// \pre \p Opcode represents a supported root (\see ::isSupportedRoot()).
static unsigned getSameExtensionOpcode(unsigned Opcode, bool IsSExt) {
switch (Opcode) {
+ case ISD::ADD:
case RISCVISD::ADD_VL:
case RISCVISD::VWADD_W_VL:
case RISCVISD::VWADDU_W_VL:
return IsSExt ? RISCVISD::VWADD_VL : RISCVISD::VWADDU_VL;
+ case ISD::MUL:
case RISCVISD::MUL_VL:
return IsSExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
+ case ISD::SUB:
case RISCVISD::SUB_VL:
case RISCVISD::VWSUB_W_VL:
case RISCVISD::VWSUBU_W_VL:
@@ -12987,7 +12995,8 @@ struct NodeExtensionHelper {
/// Get the opcode to materialize \p Opcode(sext(a), zext(b)) ->
/// newOpcode(a, b).
static unsigned getSUOpcode(unsigned Opcode) {
- assert(Opcode == RISCVISD::MUL_VL && "SU is only supported for MUL");
+ assert((Opcode == RISCVISD::MUL_VL || Opcode == ISD::MUL) &&
+ "SU is only supported for MUL");
return RISCVISD::VWMULSU_VL;
}
@@ -12995,8 +13004,10 @@ struct NodeExtensionHelper {
/// newOpcode(a, b).
static unsigned getWOpcode(unsigned Opcode, bool IsSExt) {
switch (Opcode) {
+ case ISD::ADD:
case RISCVISD::ADD_VL:
return IsSExt ? RISCVISD::VWADD_W_VL : RISCVISD::VWADDU_W_VL;
+ case ISD::SUB:
case RISCVISD::SUB_VL:
return IsSExt ? RISCVISD::VWSUB_W_VL : RISCVISD::VWSUBU_W_VL;
default:
@@ -13006,19 +13017,44 @@ struct NodeExtensionHelper {
using CombineToTry = std::function<std::optional<CombineResult>(
SDNode * /*Root*/, const NodeExtensionHelper & /*LHS*/,
- const NodeExtensionHelper & /*RHS*/)>;
+ const NodeExtensionHelper & /*RHS*/, SelectionDAG &,
+ const RISCVSubtarget &)>;
/// Check if this node needs to be fully folded or extended for all users.
bool needToPromoteOtherUsers() const { return EnforceOneUse; }
/// Helper method to set the various fields of this struct based on the
/// type of \p Root.
- void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG) {
+ void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
SupportsZExt = false;
SupportsSExt = false;
EnforceOneUse = true;
CheckMask = true;
- switch (OrigOperand.getOpcode()) {
+ unsigned Opc = OrigOperand.getOpcode();
+ switch (Opc) {
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND: {
+ MVT VT = OrigOperand.getSimpleValueType();
+ if (!VT.isVector())
+ break;
+
+ MVT NarrowVT = OrigOperand.getOperand(0)->getSimpleValueType(0);
+
+ unsigned ScalarBits = VT.getScalarSizeInBits();
+ unsigned NarrowScalarBits = NarrowVT.getScalarSizeInBits();
+
+ // Ensure the extension's semantic is equivalent to rvv vzext or vsext.
+ if (ScalarBits != NarrowScalarBits * 2)
+ break;
+
+ SupportsZExt = Opc == ISD::ZERO_EXTEND;
+ SupportsSExt = Opc == ISD::SIGN_EXTEND;
+
+ SDLoc DL(Root);
+ std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget);
+ break;
+ }
case RISCVISD::VZEXT_VL:
SupportsZExt = true;
Mask = OrigOperand.getOperand(1);
@@ -13074,8 +13110,16 @@ struct NodeExtensionHelper {
}
/// Check if \p Root supports any extension folding combines.
- static bool isSupportedRoot(const SDNode *Root) {
+ static bool isSupportedRoot(const SDNode *Root, const SelectionDAG &DAG) {
switch (Root->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL: {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(Root->getValueType(0)))
+ return false;
+ return Root->getValueType(0).isScalableVector();
+ }
case RISCVISD::ADD_VL:
case RISCVISD::MUL_VL:
case RISCVISD::VWADD_W_VL:
@@ -13090,9 +13134,10 @@ struct NodeExtensionHelper {
}
/// Build a NodeExtensionHelper for \p Root.getOperand(\p OperandIdx).
- NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG) {
- assert(isSupportedRoot(Root) && "Trying to build an helper with an "
- "unsupported root");
+ NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ assert(isSupportedRoot(Root, DAG) && "Trying to build an helper with an "
+ "unsupported root");
assert(OperandIdx < 2 && "Requesting something else than LHS or RHS");
OrigOperand = Root->getOperand(OperandIdx);
@@ -13108,7 +13153,7 @@ struct NodeExtensionHelper {
SupportsZExt =
Opc == RISCVISD::VWADDU_W_VL || Opc == RISCVISD::VWSUBU_W_VL;
SupportsSExt = !SupportsZExt;
- std::tie(Mask, VL) = getMaskAndVL(Root);
+ std::tie(Mask, VL) = getMaskAndVL(Root, DAG, Subtarget);
CheckMask = true;
// There's no existing extension here, so we don't have to worry about
// making sure it gets removed.
@@ -13117,7 +13162,7 @@ struct NodeExtensionHelper {
}
[[fallthrough]];
default:
- fillUpExtensionSupport(Root, DAG);
+ fillUpExtensionSupport(Root, DAG, Subtarget);
break;
}
}
@@ -13133,14 +13178,27 @@ struct NodeExtensionHelper {
}
/// Helper function to get the Mask and VL from \p Root.
- static std::pair<SDValue, SDValue> getMaskAndVL(const SDNode *Root) {
- assert(isSupportedRoot(Root) && "Unexpected root");
- return std::make_pair(Root->getOperand(3), Root->getOperand(4));
+ static std::pair<SDValue, SDValue>
+ getMaskAndVL(const SDNode *Root, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ assert(isSupportedRoot(Root, DAG) && "Unexpected root");
+ switch (Root->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL: {
+ SDLoc DL(Root);
+ MVT VT = Root->getSimpleValueType(0);
+ return getDefaultScalableVLOps(VT, DL, DAG, Subtarget);
+ }
+ default:
+ return std::make_pair(Root->getOperand(3), Root->getOperand(4));
+ }
}
/// Check if the Mask and VL of this operand are compatible with \p Root.
- bool areVLAndMaskCompatible(const SDNode *Root) const {
- auto [Mask, VL] = getMaskAndVL(Root);
+ bool areVLAndMaskCompatible(SDNode *Root, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) const {
+ auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget);
return isMaskCompatible(Mask) && isVLCompatible(VL);
}
@@ -13148,11 +13206,14 @@ struct NodeExtensionHelper {
/// foldings that are supported by this class.
static bool isCommutative(const SDNode *N) {
switch (N->getOpcode()) {
+ case ISD::ADD:
+ case ISD::MUL:
case RISCVISD::ADD_VL:
case RISCVISD::MUL_VL:
case RISCVISD::VWADD_W_VL:
case RISCVISD::VWADDU_W_VL:
return true;
+ case ISD::SUB:
case RISCVISD::SUB_VL:
case RISCVISD::VWSUB_W_VL:
case RISCVISD::VWSUBU_W_VL:
@@ -13197,14 +13258,25 @@ struct CombineResult {
/// Return a value that uses TargetOpcode and that can be used to replace
/// Root.
/// The actual replacement is *not* done in that method.
- SDValue materialize(SelectionDAG &DAG) const {
+ SDValue materialize(SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) const {
SDValue Mask, VL, Merge;
- std::tie(Mask, VL) = NodeExtensionHelper::getMaskAndVL(Root);
- Merge = Root->getOperand(2);
+ std::tie(Mask, VL) =
+ NodeExtensionHelper::getMaskAndVL(Root, DAG, Subtarget);
+ switch (Root->getOpcode()) {
+ default:
+ Merge = Root->getOperand(2);
+ break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ Merge = DAG.getUNDEF(Root->getValueType(0));
+ break;
+ }
return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0),
- LHS.getOrCreateExtendedOp(Root, DAG, SExtLHS),
- RHS.getOrCreateExtendedOp(Root, DAG, SExtRHS), Merge,
- Mask, VL);
+ LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtLHS),
+ RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtRHS),
+ Merge, Mask, VL);
}
};
@@ -13221,15 +13293,16 @@ struct CombineResult {
static std::optional<CombineResult>
canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS,
const NodeExtensionHelper &RHS, bool AllowSExt,
- bool AllowZExt) {
+ bool AllowZExt, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
assert((AllowSExt || AllowZExt) && "Forgot to set what you want?");
- if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root))
+ if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) ||
+ !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget))
return std::nullopt;
if (AllowZExt && LHS.SupportsZExt && RHS.SupportsZExt)
return CombineResult(NodeExtensionHelper::getSameExtensionOpcode(
Root->getOpcode(), /*IsSExt=*/false),
- Root, LHS, /*SExtLHS=*/false, RHS,
- /*SExtRHS=*/false);
+ Root, LHS, /*SExtLHS=*/false, RHS, /*SExtRHS=*/false);
if (AllowSExt && LHS.SupportsSExt && RHS.SupportsSExt)
return CombineResult(NodeExtensionHelper::getSameExtensionOpcode(
Root->getOpcode(), /*IsSExt=*/true),
@@ -13246,9 +13319,10 @@ canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS) {
+ const NodeExtensionHelper &RHS, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true,
- /*AllowZExt=*/true);
+ /*AllowZExt=*/true, DAG, Subtarget);
}
/// Check if \p Root follows a pattern Root(LHS, ext(RHS))
@@ -13257,8 +13331,9 @@ canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS) {
- if (!RHS.areVLAndMaskCompatible(Root))
+ const NodeExtensionHelper &RHS, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ if (!RHS.areVLAndMaskCompatible(Root, DAG, Subtarget))
return std::nullopt;
// FIXME: Is it useful to form a vwadd.wx or vwsub.wx if it removes a scalar
@@ -13282,9 +13357,10 @@ canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS) {
+ const NodeExtensionHelper &RHS, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true,
- /*AllowZExt=*/false);
+ /*AllowZExt=*/false, DAG, Subtarget);
}
/// Check if \p Root follows a pattern Root(zext(LHS), zext(RHS))
@@ -13293,9 +13369,10 @@ canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS) {
+ const NodeExtensionHelper &RHS, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/false,
- /*AllowZExt=*/true);
+ /*AllowZExt=*/true, DAG, Subtarget);
}
/// Check if \p Root follows a pattern Root(sext(LHS), zext(RHS))
@@ -13304,10 +13381,13 @@ canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVW_SU(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS) {
+ const NodeExtensionHelper &RHS, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+
if (!LHS.SupportsSExt || !RHS.SupportsZExt)
return std::nullopt;
- if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root))
+ if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) ||
+ !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget))
return std::nullopt;
return CombineResult(NodeExtensionHelper::getSUOpcode(Root->getOpcode()),
Root, LHS, /*SExtLHS=*/true, RHS, /*SExtRHS=*/false);
@@ -13317,6 +13397,8 @@ SmallVector<NodeExtensionHelper::CombineToTry>
NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
SmallVector<CombineToTry> Strategies;
switch (Root->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
case RISCVISD::ADD_VL:
case RISCVISD::SUB_VL:
// add|sub -> vwadd(u)|vwsub(u)
@@ -13324,6 +13406,7 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
// add|sub -> vwadd(u)_w|vwsub(u)_w
Strategies.push_back(canFoldToVW_W);
break;
+ case ISD::MUL:
case RISCVISD::MUL_VL:
// mul -> vwmul(u)
Strategies.push_back(canFoldToVWWithSameExtension);
@@ -13354,12 +13437,14 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
/// mul_vl -> vwmul(u) | vwmul_su
/// vwadd_w(u) -> vwadd(u)
/// vwub_w(u) -> vwadd(u)
-static SDValue
-combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
SelectionDAG &DAG = DCI.DAG;
- assert(NodeExtensionHelper::isSupportedRoot(N) &&
- "Shouldn't have called this method");
+ if (!NodeExtensionHelper::isSupportedRoot(N, DAG))
+ return SDValue();
+
SmallVector<SDNode *> Worklist;
SmallSet<SDNode *, 8> Inserted;
Worklist.push_back(N);
@@ -13368,11 +13453,11 @@ combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
while (!Worklist.empty()) {
SDNode *Root = Worklist.pop_back_val();
- if (!NodeExtensionHelper::isSupportedRoot(Root))
+ if (!NodeExtensionHelper::isSupportedRoot(Root, DAG))
return SDValue();
- NodeExtensionHelper LHS(N, 0, DAG);
- NodeExtensionHelper RHS(N, 1, DAG);
+ NodeExtensionHelper LHS(N, 0, DAG, Subtarget);
+ NodeExtensionHelper RHS(N, 1, DAG, Subtarget);
auto AppendUsersIfNeeded = [&Worklist,
&Inserted](const NodeExtensionHelper &Op) {
if (Op.needToPromoteOtherUsers()) {
@@ -13399,7 +13484,8 @@ combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
for (NodeExtensionHelper::CombineToTry FoldingStrategy :
FoldingStrategies) {
- std::optional<CombineResult> Res = FoldingStrategy(N, LHS, RHS);
+ std::optional<CombineResult> Res =
+ FoldingStrategy(N, LHS, RHS, DAG, Subtarget);
if (Res) {
Matched = true;
CombinesToApply.push_back(*Res);
@@ -13428,7 +13514,7 @@ combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SmallVector<std::pair<SDValue, SDValue>> ValuesToReplace;
ValuesToReplace.reserve(CombinesToApply.size());
for (CombineResult Res : CombinesToApply) {
- SDValue NewValue = Res.materialize(DAG);
+ SDValue NewValue = Res.materialize(DAG, Subtarget);
if (!InputRootReplacement) {
assert(Res.Root == N &&
"First element is expected to be the current node");
@@ -14700,13 +14786,20 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
- assert(N->getOpcode() == RISCVISD::ADD_VL);
+
+ assert(N->getOpcode() == RISCVISD::ADD_VL || N->getOpcode() == ISD::ADD);
+
+ if (N->getValueType(0).isFixedLengthVector())
+ return SDValue();
+
SDValue Addend = N->getOperand(0);
SDValue MulOp = N->getOperand(1);
- SDValue AddMergeOp = N->getOperand(2);
- if (!AddMergeOp.isUndef())
- return SDValue();
+ if (N->getOpcode() == RISCVISD::ADD_VL) {
+ SDValue AddMergeOp = N->getOperand(2);
+ if (!AddMergeOp.isUndef())
+ return SDValue();
+ }
auto IsVWMulOpc = [](unsigned Opc) {
switch (Opc) {
@@ -14730,8 +14823,16 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
if (!MulMergeOp.isUndef())
return SDValue();
- SDValue AddMask = N->getOperand(3);
- SDValue AddVL = N->getOperand(4);
+ auto [AddMask, AddVL] = [](SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ if (N->getOpcode() == ISD::ADD) {
+ SDLoc DL(N);
+ return getDefaultScalableVLOps(N->getSimpleValueType(0), DL, DAG,
+ Subtarget);
+ }
+ return std::make_pair(N->getOperand(3), N->getOperand(4));
+ }(N, DAG, Subtarget);
+
SDValue MulMask = MulOp.getOperand(3);
SDValue MulVL = MulOp.getOperand(4);
@@ -14997,10 +15098,18 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return DAG.getNode(ISD::AND, DL, VT, NewFMV,
DAG.getConstant(~SignBit, DL, VT));
}
- case ISD::ADD:
+ case ISD::ADD: {
+ if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ return V;
+ if (SDValue V = combineToVWMACC(N, DAG, Subtarget))
+ return V;
return performADDCombine(N, DAG, Subtarget);
- case ISD::SUB:
+ }
+ case ISD::SUB: {
+ if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ return V;
return performSUBCombine(N, DAG, Subtarget);
+ }
case ISD::AND:
return performANDCombine(N, DCI, Subtarget);
case ISD::OR:
@@ -15008,6 +15117,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::XOR:
return performXORCombine(N, DAG, Subtarget);
case ISD::MUL:
+ if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ return V;
return performMULCombine(N, DAG);
case ISD::FADD:
case ISD::UMAX:
@@ -15484,7 +15595,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::ADD_VL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI))
+ if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
return V;
return combineToVWMACC(N, DAG, Subtarget);
case RISCVISD::SUB_VL:
@@ -15493,7 +15604,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case RISCVISD::VWSUB_W_VL:
case RISCVISD::VWSUBU_W_VL:
case RISCVISD::MUL_VL:
- return combineBinOp_VLToVWBinOp_VL(N, DCI);
+ return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget);
case RISCVISD::VFMADD_VL:
case RISCVISD::VFNMADD_VL:
case RISCVISD::VFMSUB_VL:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index 47d65c2593a4cc..fc94f8c2a52797 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -1231,16 +1231,17 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_nxv1i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-F-NEXT: vmv.v.x v9, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8
-; CHECK-F-NEXT: vsrl.vi v8, v9, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v9, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v9, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8
+; CHECK-F-NEXT: vsrl.vi v8, v10, 23
+; CHECK-F-NEXT: vwsubu.wv v9, v9, v8
; CHECK-F-NEXT: li a1, 64
-; CHECK-F-NEXT: vminu.vx v8, v8, a1
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT: vminu.vx v8, v9, a1
; CHECK-F-NEXT: fsrm a0
; CHECK-F-NEXT: ret
;
@@ -1371,16 +1372,17 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_nxv2i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-F-NEXT: vmv.v.x v10, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8
-; CHECK-F-NEXT: vsrl.vi v8, v10, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v10, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v10, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8
+; CHECK-F-NEXT: vsrl.vi v8, v12, 23
+; CHECK-F-NEXT: vwsubu.wv v10, v10, v8
; CHECK-F-NEXT: li a1, 64
-; CHECK-F-NEXT: vminu.vx v8, v8, a1
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT: vminu.vx v8, v10, a1
; CHECK-F-NEXT: fsrm a0
; CHECK-F-NEXT: ret
;
@@ -1511,16 +1513,17 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_nxv4i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-F-NEXT: vmv.v.x v12, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8
-; CHECK-F-NEXT: vsrl.vi v8, v12, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v12, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v12, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8
+; CHECK-F-NEXT: vsrl.vi v8, v16, 23
+; CHECK-F-NEXT: vwsubu.wv v12, v12, v8
; CHECK-F-NEXT: li a1, 64
-; CHECK-F-NEXT: vminu.vx v8, v8, a1
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT: vminu.vx v8, v12, a1
; CHECK-F-NEXT: fsrm a0
; CHECK-F-NEXT: ret
;
@@ -1651,16 +1654,17 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_nxv8i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-F-NEXT: vmv.v.x v16, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8
-; CHECK-F-NEXT: vsrl.vi v8, v16, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v16, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v16, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v24, v8
+; CHECK-F-NEXT: vsrl.vi v8, v24, 23
+; CHECK-F-NEXT: vwsubu.wv v16, v16, v8
; CHECK-F-NEXT: li a1, 64
-; CHECK-F-NEXT: vminu.vx v8, v8, a1
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT: vminu.vx v8, v16, a1
; CHECK-F-NEXT: fsrm a0
; CHECK-F-NEXT: ret
;
@@ -2833,15 +2837,16 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_zero_undef_nxv1i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-F-NEXT: vmv.v.x v9, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8
-; CHECK-F-NEXT: vsrl.vi v8, v9, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v9, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v9, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8
+; CHECK-F-NEXT: vsrl.vi v8, v10, 23
+; CHECK-F-NEXT: vwsubu.wv v9, v9, v8
; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: vmv1r.v v8, v9
; CHECK-F-NEXT: ret
;
; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64:
@@ -2968,15 +2973,16 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_zero_undef_nxv2i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-F-NEXT: vmv.v.x v10, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8
-; CHECK-F-NEXT: vsrl.vi v8, v10, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v10, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v10, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8
+; CHECK-F-NEXT: vsrl.vi v8, v12, 23
+; CHECK-F-NEXT: vwsubu.wv v10, v10, v8
; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: vmv2r.v v8, v10
; CHECK-F-NEXT: ret
;
; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64:
@@ -3103,15 +3109,16 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_zero_undef_nxv4i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-F-NEXT: vmv.v.x v12, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8
-; CHECK-F-NEXT: vsrl.vi v8, v12, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v12, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v12, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8
+; CHECK-F-NEXT: vsrl.vi v8, v16, 23
+; CHECK-F-NEXT: vwsubu.wv v12, v12, v8
; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: vmv4r.v v8, v12
; CHECK-F-NEXT: ret
;
; CHECK-D-LABEL: ctlz_zero_undef_nxv4i64:
@@ -3238,14 +3245,15 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT: vmv8r.v v16, v8
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-F-NEXT: vmv.v.x v8, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8
-; CHECK-F-NEXT: vsrl.vi v8, v16, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v16, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v16, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v24, v16
+; CHECK-F-NEXT: vsrl.vi v16, v24, 23
+; CHECK-F-NEXT: vwsubu.wv v8, v8, v16
; CHECK-F-NEXT: fsrm a0
; CHECK-F-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
index d99e3a7fe690aa..458930abca6aae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -1,25 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING,RV32
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING,RV32
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING,RV64
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING,RV64
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,RV64
; Check that the default value enables the web folding and
; that it is bigger than 3.
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING
-
-; FIXME: We should use vwadd/vwsub/vwmul instructions.
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING,RV64
; Check that the scalable vector add/sub/mul operations are all promoted into their
; vw counterpart when the folding of the web size is increased to 3.
; We need the web size to be at least 3 for the folding to happen, because
; %c has 3 uses.
; see https://github.com/llvm/llvm-project/pull/72340
-; FIXME: We don't currently use widening instructions.
-define <vscale x 2 x i16> @vwop_vscale_sext_multiple_users(ptr %x, ptr %y, ptr %z) {
-; NO_FOLDING-LABEL: vwop_vscale_sext_multiple_users:
+
+define <vscale x 2 x i16> @vwop_vscale_sext_i8i16_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_sext_i8i16_multiple_users:
; NO_FOLDING: # %bb.0:
; NO_FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
; NO_FOLDING-NEXT: vle8.v v8, (a0)
@@ -35,20 +33,18 @@ define <vscale x 2 x i16> @vwop_vscale_sext_multiple_users(ptr %x, ptr %y, ptr %
; NO_FOLDING-NEXT: vor.vv v8, v8, v9
; NO_FOLDING-NEXT: ret
;
-; FOLDING-LABEL: vwop_vscale_sext_multiple_users:
+; FOLDING-LABEL: vwop_vscale_sext_i8i16_multiple_users:
; FOLDING: # %bb.0:
-; FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; FOLDING-NEXT: vle8.v v8, (a0)
; FOLDING-NEXT: vle8.v v9, (a1)
; FOLDING-NEXT: vle8.v v10, (a2)
-; FOLDING-NEXT: vsext.vf2 v11, v8
-; FOLDING-NEXT: vsext.vf2 v8, v9
-; FOLDING-NEXT: vsext.vf2 v9, v10
-; FOLDING-NEXT: vmul.vv v8, v11, v8
-; FOLDING-NEXT: vadd.vv v10, v11, v9
-; FOLDING-NEXT: vsub.vv v9, v11, v9
-; FOLDING-NEXT: vor.vv v8, v8, v10
-; FOLDING-NEXT: vor.vv v8, v8, v9
+; FOLDING-NEXT: vwmul.vv v11, v8, v9
+; FOLDING-NEXT: vwadd.vv v9, v8, v10
+; FOLDING-NEXT: vwsub.vv v12, v8, v10
+; FOLDING-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; FOLDING-NEXT: vor.vv v8, v11, v9
+; FOLDING-NEXT: vor.vv v8, v8, v12
; FOLDING-NEXT: ret
%a = load <vscale x 2 x i8>, ptr %x
%b = load <vscale x 2 x i8>, ptr %y
@@ -64,18 +60,162 @@ define <vscale x 2 x i16> @vwop_vscale_sext_multiple_users(ptr %x, ptr %y, ptr %
ret <vscale x 2 x i16> %i
}
+define <vscale x 2 x i32> @vwop_vscale_sext_i16i32_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_sext_i16i32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; NO_FOLDING-NEXT: vle16.v v8, (a0)
+; NO_FOLDING-NEXT: vle16.v v9, (a1)
+; NO_FOLDING-NEXT: vle16.v v10, (a2)
+; NO_FOLDING-NEXT: vsext.vf2 v11, v8
+; NO_FOLDING-NEXT: vsext.vf2 v8, v9
+; NO_FOLDING-NEXT: vsext.vf2 v9, v10
+; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
+; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
+; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v9
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vwop_vscale_sext_i16i32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; FOLDING-NEXT: vle16.v v8, (a0)
+; FOLDING-NEXT: vle16.v v9, (a1)
+; FOLDING-NEXT: vle16.v v10, (a2)
+; FOLDING-NEXT: vwmul.vv v11, v8, v9
+; FOLDING-NEXT: vwadd.vv v9, v8, v10
+; FOLDING-NEXT: vwsub.vv v12, v8, v10
+; FOLDING-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; FOLDING-NEXT: vor.vv v8, v11, v9
+; FOLDING-NEXT: vor.vv v8, v8, v12
+; FOLDING-NEXT: ret
+ %a = load <vscale x 2 x i16>, ptr %x
+ %b = load <vscale x 2 x i16>, ptr %y
+ %b2 = load <vscale x 2 x i16>, ptr %z
+ %c = sext <vscale x 2 x i16> %a to <vscale x 2 x i32>
+ %d = sext <vscale x 2 x i16> %b to <vscale x 2 x i32>
+ %d2 = sext <vscale x 2 x i16> %b2 to <vscale x 2 x i32>
+ %e = mul <vscale x 2 x i32> %c, %d
+ %f = add <vscale x 2 x i32> %c, %d2
+ %g = sub <vscale x 2 x i32> %c, %d2
+ %h = or <vscale x 2 x i32> %e, %f
+ %i = or <vscale x 2 x i32> %h, %g
+ ret <vscale x 2 x i32> %i
+}
+define <vscale x 2 x i64> @vwop_vscale_sext_i32i64_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_sext_i32i64_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vl1re32.v v8, (a0)
+; NO_FOLDING-NEXT: vl1re32.v v9, (a1)
+; NO_FOLDING-NEXT: vl1re32.v v10, (a2)
+; NO_FOLDING-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; NO_FOLDING-NEXT: vsext.vf2 v12, v8
+; NO_FOLDING-NEXT: vsext.vf2 v14, v9
+; NO_FOLDING-NEXT: vsext.vf2 v8, v10
+; NO_FOLDING-NEXT: vmul.vv v10, v12, v14
+; NO_FOLDING-NEXT: vadd.vv v14, v12, v8
+; NO_FOLDING-NEXT: vsub.vv v8, v12, v8
+; NO_FOLDING-NEXT: vor.vv v10, v10, v14
+; NO_FOLDING-NEXT: vor.vv v8, v10, v8
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vwop_vscale_sext_i32i64_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vl1re32.v v8, (a0)
+; FOLDING-NEXT: vl1re32.v v9, (a1)
+; FOLDING-NEXT: vl1re32.v v10, (a2)
+; FOLDING-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; FOLDING-NEXT: vwmul.vv v12, v8, v9
+; FOLDING-NEXT: vwadd.vv v14, v8, v10
+; FOLDING-NEXT: vwsub.vv v16, v8, v10
+; FOLDING-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; FOLDING-NEXT: vor.vv v8, v12, v14
+; FOLDING-NEXT: vor.vv v8, v8, v16
+; FOLDING-NEXT: ret
+ %a = load <vscale x 2 x i32>, ptr %x
+ %b = load <vscale x 2 x i32>, ptr %y
+ %b2 = load <vscale x 2 x i32>, ptr %z
+ %c = sext <vscale x 2 x i32> %a to <vscale x 2 x i64>
+ %d = sext <vscale x 2 x i32> %b to <vscale x 2 x i64>
+ %d2 = sext <vscale x 2 x i32> %b2 to <vscale x 2 x i64>
+ %e = mul <vscale x 2 x i64> %c, %d
+ %f = add <vscale x 2 x i64> %c, %d2
+ %g = sub <vscale x 2 x i64> %c, %d2
+ %h = or <vscale x 2 x i64> %e, %f
+ %i = or <vscale x 2 x i64> %h, %g
+ ret <vscale x 2 x i64> %i
+}
-define <vscale x 2 x i16> @vwop_vscale_zext_multiple_users(ptr %x, ptr %y, ptr %z) {
-; NO_FOLDING-LABEL: vwop_vscale_zext_multiple_users:
+define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) {
+; RV32-LABEL: vwop_vscale_sext_i1i32_multiple_users:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; RV32-NEXT: vlm.v v8, (a0)
+; RV32-NEXT: vlm.v v9, (a1)
+; RV32-NEXT: vlm.v v10, (a2)
+; RV32-NEXT: vmv.v.i v11, 0
+; RV32-NEXT: vmv.v.v v0, v8
+; RV32-NEXT: vmerge.vim v12, v11, -1, v0
+; RV32-NEXT: vmv.v.v v0, v9
+; RV32-NEXT: vmerge.vim v9, v11, -1, v0
+; RV32-NEXT: vmv.v.v v0, v10
+; RV32-NEXT: vmerge.vim v10, v11, -1, v0
+; RV32-NEXT: vmul.vv v9, v12, v9
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsub.vv v11, v12, v10
+; RV32-NEXT: vmv.v.v v0, v8
+; RV32-NEXT: vsub.vx v10, v10, a0, v0.t
+; RV32-NEXT: vor.vv v8, v9, v10
+; RV32-NEXT: vor.vv v8, v8, v11
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwop_vscale_sext_i1i32_multiple_users:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV64-NEXT: vlm.v v8, (a0)
+; RV64-NEXT: vlm.v v9, (a1)
+; RV64-NEXT: vlm.v v10, (a2)
+; RV64-NEXT: vmv.v.i v11, 0
+; RV64-NEXT: vmv.v.v v0, v8
+; RV64-NEXT: vmerge.vim v12, v11, -1, v0
+; RV64-NEXT: vmv.v.v v0, v9
+; RV64-NEXT: vmerge.vim v9, v11, -1, v0
+; RV64-NEXT: vmv.v.v v0, v10
+; RV64-NEXT: vmerge.vim v10, v11, -1, v0
+; RV64-NEXT: vmul.vv v9, v12, v9
+; RV64-NEXT: vmv.v.v v0, v8
+; RV64-NEXT: vmerge.vim v8, v11, 1, v0
+; RV64-NEXT: vsub.vv v8, v10, v8
+; RV64-NEXT: vsub.vv v10, v12, v10
+; RV64-NEXT: vor.vv v8, v9, v8
+; RV64-NEXT: vor.vv v8, v8, v10
+; RV64-NEXT: ret
+ %a = load <vscale x 2 x i1>, ptr %x
+ %b = load <vscale x 2 x i1>, ptr %y
+ %b2 = load <vscale x 2 x i1>, ptr %z
+ %c = sext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %d = sext <vscale x 2 x i1> %b to <vscale x 2 x i32>
+ %d2 = sext <vscale x 2 x i1> %b2 to <vscale x 2 x i32>
+ %e = mul <vscale x 2 x i32> %c, %d
+ %f = add <vscale x 2 x i32> %c, %d2
+ %g = sub <vscale x 2 x i32> %c, %d2
+ %h = or <vscale x 2 x i32> %e, %f
+ %i = or <vscale x 2 x i32> %h, %g
+ ret <vscale x 2 x i32> %i
+}
+
+define <vscale x 2 x i32> @vwop_vscale_sext_i8i32_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_sext_i8i32_multiple_users:
; NO_FOLDING: # %bb.0:
-; NO_FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma
; NO_FOLDING-NEXT: vle8.v v8, (a0)
; NO_FOLDING-NEXT: vle8.v v9, (a1)
; NO_FOLDING-NEXT: vle8.v v10, (a2)
-; NO_FOLDING-NEXT: vzext.vf2 v11, v8
-; NO_FOLDING-NEXT: vzext.vf2 v8, v9
-; NO_FOLDING-NEXT: vzext.vf2 v9, v10
+; NO_FOLDING-NEXT: vsext.vf4 v11, v8
+; NO_FOLDING-NEXT: vsext.vf4 v8, v9
+; NO_FOLDING-NEXT: vsext.vf4 v9, v10
; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
@@ -83,20 +223,64 @@ define <vscale x 2 x i16> @vwop_vscale_zext_multiple_users(ptr %x, ptr %y, ptr %
; NO_FOLDING-NEXT: vor.vv v8, v8, v9
; NO_FOLDING-NEXT: ret
;
-; FOLDING-LABEL: vwop_vscale_zext_multiple_users:
+; FOLDING-LABEL: vwop_vscale_sext_i8i32_multiple_users:
; FOLDING: # %bb.0:
-; FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma
; FOLDING-NEXT: vle8.v v8, (a0)
; FOLDING-NEXT: vle8.v v9, (a1)
; FOLDING-NEXT: vle8.v v10, (a2)
-; FOLDING-NEXT: vzext.vf2 v11, v8
-; FOLDING-NEXT: vzext.vf2 v8, v9
-; FOLDING-NEXT: vzext.vf2 v9, v10
+; FOLDING-NEXT: vsext.vf4 v11, v8
+; FOLDING-NEXT: vsext.vf4 v8, v9
+; FOLDING-NEXT: vsext.vf4 v9, v10
; FOLDING-NEXT: vmul.vv v8, v11, v8
; FOLDING-NEXT: vadd.vv v10, v11, v9
; FOLDING-NEXT: vsub.vv v9, v11, v9
; FOLDING-NEXT: vor.vv v8, v8, v10
; FOLDING-NEXT: vor.vv v8, v8, v9
+; FOLDING-NEXT: ret
+ %a = load <vscale x 2 x i8>, ptr %x
+ %b = load <vscale x 2 x i8>, ptr %y
+ %b2 = load <vscale x 2 x i8>, ptr %z
+ %c = sext <vscale x 2 x i8> %a to <vscale x 2 x i32>
+ %d = sext <vscale x 2 x i8> %b to <vscale x 2 x i32>
+ %d2 = sext <vscale x 2 x i8> %b2 to <vscale x 2 x i32>
+ %e = mul <vscale x 2 x i32> %c, %d
+ %f = add <vscale x 2 x i32> %c, %d2
+ %g = sub <vscale x 2 x i32> %c, %d2
+ %h = or <vscale x 2 x i32> %e, %f
+ %i = or <vscale x 2 x i32> %h, %g
+ ret <vscale x 2 x i32> %i
+}
+
+define <vscale x 2 x i16> @vwop_vscale_zext_i8i16_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_zext_i8i16_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; NO_FOLDING-NEXT: vle8.v v8, (a0)
+; NO_FOLDING-NEXT: vle8.v v9, (a1)
+; NO_FOLDING-NEXT: vle8.v v10, (a2)
+; NO_FOLDING-NEXT: vzext.vf2 v11, v8
+; NO_FOLDING-NEXT: vzext.vf2 v8, v9
+; NO_FOLDING-NEXT: vzext.vf2 v9, v10
+; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
+; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
+; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v9
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vwop_vscale_zext_i8i16_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; FOLDING-NEXT: vle8.v v8, (a0)
+; FOLDING-NEXT: vle8.v v9, (a1)
+; FOLDING-NEXT: vle8.v v10, (a2)
+; FOLDING-NEXT: vwmulu.vv v11, v8, v9
+; FOLDING-NEXT: vwaddu.vv v9, v8, v10
+; FOLDING-NEXT: vwsubu.vv v12, v8, v10
+; FOLDING-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; FOLDING-NEXT: vor.vv v8, v11, v9
+; FOLDING-NEXT: vor.vv v8, v8, v12
; FOLDING-NEXT: ret
%a = load <vscale x 2 x i8>, ptr %x
%b = load <vscale x 2 x i8>, ptr %y
@@ -111,3 +295,187 @@ define <vscale x 2 x i16> @vwop_vscale_zext_multiple_users(ptr %x, ptr %y, ptr %
%i = or <vscale x 2 x i16> %h, %g
ret <vscale x 2 x i16> %i
}
+
+define <vscale x 2 x i32> @vwop_vscale_zext_i16i32_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_zext_i16i32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; NO_FOLDING-NEXT: vle16.v v8, (a0)
+; NO_FOLDING-NEXT: vle16.v v9, (a1)
+; NO_FOLDING-NEXT: vle16.v v10, (a2)
+; NO_FOLDING-NEXT: vzext.vf2 v11, v8
+; NO_FOLDING-NEXT: vzext.vf2 v8, v9
+; NO_FOLDING-NEXT: vzext.vf2 v9, v10
+; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
+; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
+; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v9
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vwop_vscale_zext_i16i32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; FOLDING-NEXT: vle16.v v8, (a0)
+; FOLDING-NEXT: vle16.v v9, (a1)
+; FOLDING-NEXT: vle16.v v10, (a2)
+; FOLDING-NEXT: vwmulu.vv v11, v8, v9
+; FOLDING-NEXT: vwaddu.vv v9, v8, v10
+; FOLDING-NEXT: vwsubu.vv v12, v8, v10
+; FOLDING-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; FOLDING-NEXT: vor.vv v8, v11, v9
+; FOLDING-NEXT: vor.vv v8, v8, v12
+; FOLDING-NEXT: ret
+ %a = load <vscale x 2 x i16>, ptr %x
+ %b = load <vscale x 2 x i16>, ptr %y
+ %b2 = load <vscale x 2 x i16>, ptr %z
+ %c = zext <vscale x 2 x i16> %a to <vscale x 2 x i32>
+ %d = zext <vscale x 2 x i16> %b to <vscale x 2 x i32>
+ %d2 = zext <vscale x 2 x i16> %b2 to <vscale x 2 x i32>
+ %e = mul <vscale x 2 x i32> %c, %d
+ %f = add <vscale x 2 x i32> %c, %d2
+ %g = sub <vscale x 2 x i32> %c, %d2
+ %h = or <vscale x 2 x i32> %e, %f
+ %i = or <vscale x 2 x i32> %h, %g
+ ret <vscale x 2 x i32> %i
+}
+
+define <vscale x 2 x i64> @vwop_vscale_zext_i32i64_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_zext_i32i64_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vl1re32.v v8, (a0)
+; NO_FOLDING-NEXT: vl1re32.v v9, (a1)
+; NO_FOLDING-NEXT: vl1re32.v v10, (a2)
+; NO_FOLDING-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; NO_FOLDING-NEXT: vzext.vf2 v12, v8
+; NO_FOLDING-NEXT: vzext.vf2 v14, v9
+; NO_FOLDING-NEXT: vzext.vf2 v8, v10
+; NO_FOLDING-NEXT: vmul.vv v10, v12, v14
+; NO_FOLDING-NEXT: vadd.vv v14, v12, v8
+; NO_FOLDING-NEXT: vsub.vv v8, v12, v8
+; NO_FOLDING-NEXT: vor.vv v10, v10, v14
+; NO_FOLDING-NEXT: vor.vv v8, v10, v8
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vwop_vscale_zext_i32i64_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vl1re32.v v8, (a0)
+; FOLDING-NEXT: vl1re32.v v9, (a1)
+; FOLDING-NEXT: vl1re32.v v10, (a2)
+; FOLDING-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; FOLDING-NEXT: vwmulu.vv v12, v8, v9
+; FOLDING-NEXT: vwaddu.vv v14, v8, v10
+; FOLDING-NEXT: vwsubu.vv v16, v8, v10
+; FOLDING-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; FOLDING-NEXT: vor.vv v8, v12, v14
+; FOLDING-NEXT: vor.vv v8, v8, v16
+; FOLDING-NEXT: ret
+ %a = load <vscale x 2 x i32>, ptr %x
+ %b = load <vscale x 2 x i32>, ptr %y
+ %b2 = load <vscale x 2 x i32>, ptr %z
+ %c = zext <vscale x 2 x i32> %a to <vscale x 2 x i64>
+ %d = zext <vscale x 2 x i32> %b to <vscale x 2 x i64>
+ %d2 = zext <vscale x 2 x i32> %b2 to <vscale x 2 x i64>
+ %e = mul <vscale x 2 x i64> %c, %d
+ %f = add <vscale x 2 x i64> %c, %d2
+ %g = sub <vscale x 2 x i64> %c, %d2
+ %h = or <vscale x 2 x i64> %e, %f
+ %i = or <vscale x 2 x i64> %h, %g
+ ret <vscale x 2 x i64> %i
+}
+
+define <vscale x 2 x i32> @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) {
+; RV32-LABEL: vwop_vscale_zext_i1i32_multiple_users:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; RV32-NEXT: vlm.v v0, (a0)
+; RV32-NEXT: vlm.v v8, (a2)
+; RV32-NEXT: vlm.v v9, (a1)
+; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: vmerge.vim v11, v10, 1, v0
+; RV32-NEXT: vmv.v.v v0, v8
+; RV32-NEXT: vmerge.vim v8, v10, 1, v0
+; RV32-NEXT: vadd.vv v10, v11, v8
+; RV32-NEXT: vsub.vv v8, v11, v8
+; RV32-NEXT: vmv.v.v v0, v9
+; RV32-NEXT: vor.vv v10, v10, v11, v0.t
+; RV32-NEXT: vor.vv v8, v10, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwop_vscale_zext_i1i32_multiple_users:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV64-NEXT: vlm.v v0, (a0)
+; RV64-NEXT: vlm.v v8, (a1)
+; RV64-NEXT: vlm.v v9, (a2)
+; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: vmerge.vim v11, v10, 1, v0
+; RV64-NEXT: vmv.v.v v0, v8
+; RV64-NEXT: vmerge.vim v8, v10, 1, v0
+; RV64-NEXT: vmv.v.v v0, v9
+; RV64-NEXT: vmerge.vim v9, v10, 1, v0
+; RV64-NEXT: vmul.vv v8, v11, v8
+; RV64-NEXT: vadd.vv v10, v11, v9
+; RV64-NEXT: vsub.vv v9, v11, v9
+; RV64-NEXT: vor.vv v8, v8, v10
+; RV64-NEXT: vor.vv v8, v8, v9
+; RV64-NEXT: ret
+ %a = load <vscale x 2 x i1>, ptr %x
+ %b = load <vscale x 2 x i1>, ptr %y
+ %b2 = load <vscale x 2 x i1>, ptr %z
+ %c = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %d = zext <vscale x 2 x i1> %b to <vscale x 2 x i32>
+ %d2 = zext <vscale x 2 x i1> %b2 to <vscale x 2 x i32>
+ %e = mul <vscale x 2 x i32> %c, %d
+ %f = add <vscale x 2 x i32> %c, %d2
+ %g = sub <vscale x 2 x i32> %c, %d2
+ %h = or <vscale x 2 x i32> %e, %f
+ %i = or <vscale x 2 x i32> %h, %g
+ ret <vscale x 2 x i32> %i
+}
+
+define <vscale x 2 x i32> @vwop_vscale_zext_i8i32_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_zext_i8i32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; NO_FOLDING-NEXT: vle8.v v8, (a0)
+; NO_FOLDING-NEXT: vle8.v v9, (a1)
+; NO_FOLDING-NEXT: vle8.v v10, (a2)
+; NO_FOLDING-NEXT: vzext.vf4 v11, v8
+; NO_FOLDING-NEXT: vzext.vf4 v8, v9
+; NO_FOLDING-NEXT: vzext.vf4 v9, v10
+; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
+; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
+; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v9
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vwop_vscale_zext_i8i32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; FOLDING-NEXT: vle8.v v8, (a0)
+; FOLDING-NEXT: vle8.v v9, (a1)
+; FOLDING-NEXT: vle8.v v10, (a2)
+; FOLDING-NEXT: vzext.vf4 v11, v8
+; FOLDING-NEXT: vzext.vf4 v8, v9
+; FOLDING-NEXT: vzext.vf4 v9, v10
+; FOLDING-NEXT: vmul.vv v8, v11, v8
+; FOLDING-NEXT: vadd.vv v10, v11, v9
+; FOLDING-NEXT: vsub.vv v9, v11, v9
+; FOLDING-NEXT: vor.vv v8, v8, v10
+; FOLDING-NEXT: vor.vv v8, v8, v9
+; FOLDING-NEXT: ret
+ %a = load <vscale x 2 x i8>, ptr %x
+ %b = load <vscale x 2 x i8>, ptr %y
+ %b2 = load <vscale x 2 x i8>, ptr %z
+ %c = zext <vscale x 2 x i8> %a to <vscale x 2 x i32>
+ %d = zext <vscale x 2 x i8> %b to <vscale x 2 x i32>
+ %d2 = zext <vscale x 2 x i8> %b2 to <vscale x 2 x i32>
+ %e = mul <vscale x 2 x i32> %c, %d
+ %f = add <vscale x 2 x i32> %c, %d2
+ %g = sub <vscale x 2 x i32> %c, %d2
+ %h = or <vscale x 2 x i32> %e, %f
+ %i = or <vscale x 2 x i32> %h, %g
+ ret <vscale x 2 x i32> %i
+}
>From 20f10afc530cd2d8a2b5bee7b8e1e56da0fb9b4d Mon Sep 17 00:00:00 2001
From: sun-jacobi <sun1011jacobi at gmail.com>
Date: Thu, 4 Jan 2024 18:07:25 +0900
Subject: [PATCH 2/2] [RISCV][ISel] Ensure the narrowing element type is legal.
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 +-
.../RISCV/rvv/vscale-vw-web-simplification.ll | 113 ++++++++++++++++++
2 files changed, 119 insertions(+), 1 deletion(-)
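For illustration only (not taken from the patch): the new tests below cover operands extended from mask (i1) vectors, and per their CHECK lines the combine leaves such inputs to ordinary lowering rather than forming vwadd/vwsub/vwmul. A reduced IR sketch of that shape, with a hypothetical name:

define <vscale x 2 x i8> @no_fold_sketch(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) {
  ; extensions from i1 are not the double-element-width shape that
  ; rvv vsext/vzext provide, so no widening op is expected here
  %x = sext <vscale x 2 x i1> %a to <vscale x 2 x i8>
  %y = sext <vscale x 2 x i1> %b to <vscale x 2 x i8>
  %s = add <vscale x 2 x i8> %x, %y
  ret <vscale x 2 x i8> %s
}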
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 2fb79c81b7f169..a35258f7628c0f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -13039,11 +13039,16 @@ struct NodeExtensionHelper {
if (!VT.isVector())
break;
- MVT NarrowVT = OrigOperand.getOperand(0)->getSimpleValueType(0);
+ SDValue NarrowElt = OrigOperand.getOperand(0);
+ MVT NarrowVT = NarrowElt.getSimpleValueType();
unsigned ScalarBits = VT.getScalarSizeInBits();
unsigned NarrowScalarBits = NarrowVT.getScalarSizeInBits();
+ // Ensure the narrowing element type is legal
+ if (!Subtarget.getTargetLowering()->isTypeLegal(NarrowElt.getValueType()))
+ break;
+
// Ensure the extension's semantic is equivalent to rvv vzext or vsext.
if (ScalarBits != NarrowScalarBits * 2)
break;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
index 458930abca6aae..972fa66917a568 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -206,6 +206,64 @@ define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y,
ret <vscale x 2 x i32> %i
}
+define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; NO_FOLDING-NEXT: vlm.v v8, (a0)
+; NO_FOLDING-NEXT: vlm.v v9, (a1)
+; NO_FOLDING-NEXT: vlm.v v10, (a2)
+; NO_FOLDING-NEXT: vmv.v.i v11, 0
+; NO_FOLDING-NEXT: vmv1r.v v0, v8
+; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0
+; NO_FOLDING-NEXT: vmv1r.v v0, v9
+; NO_FOLDING-NEXT: vmerge.vim v9, v11, -1, v0
+; NO_FOLDING-NEXT: vmv1r.v v0, v10
+; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
+; NO_FOLDING-NEXT: vmul.vv v9, v12, v9
+; NO_FOLDING-NEXT: vmv1r.v v0, v8
+; NO_FOLDING-NEXT: vmerge.vim v8, v11, 1, v0
+; NO_FOLDING-NEXT: vsub.vv v8, v10, v8
+; NO_FOLDING-NEXT: vsub.vv v10, v12, v10
+; NO_FOLDING-NEXT: vor.vv v8, v9, v8
+; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; FOLDING-NEXT: vlm.v v8, (a0)
+; FOLDING-NEXT: vlm.v v9, (a1)
+; FOLDING-NEXT: vlm.v v10, (a2)
+; FOLDING-NEXT: vmv.v.i v11, 0
+; FOLDING-NEXT: vmv1r.v v0, v8
+; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0
+; FOLDING-NEXT: vmv1r.v v0, v9
+; FOLDING-NEXT: vmerge.vim v9, v11, -1, v0
+; FOLDING-NEXT: vmv1r.v v0, v10
+; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
+; FOLDING-NEXT: vmul.vv v9, v12, v9
+; FOLDING-NEXT: vmv1r.v v0, v8
+; FOLDING-NEXT: vmerge.vim v8, v11, 1, v0
+; FOLDING-NEXT: vsub.vv v8, v10, v8
+; FOLDING-NEXT: vsub.vv v10, v12, v10
+; FOLDING-NEXT: vor.vv v8, v9, v8
+; FOLDING-NEXT: vor.vv v8, v8, v10
+; FOLDING-NEXT: ret
+ %a = load <vscale x 2 x i1>, ptr %x
+ %b = load <vscale x 2 x i1>, ptr %y
+ %b2 = load <vscale x 2 x i1>, ptr %z
+ %c = sext <vscale x 2 x i1> %a to <vscale x 2 x i8>
+ %d = sext <vscale x 2 x i1> %b to <vscale x 2 x i8>
+ %d2 = sext <vscale x 2 x i1> %b2 to <vscale x 2 x i8>
+ %e = mul <vscale x 2 x i8> %c, %d
+ %f = add <vscale x 2 x i8> %c, %d2
+ %g = sub <vscale x 2 x i8> %c, %d2
+ %h = or <vscale x 2 x i8> %e, %f
+ %i = or <vscale x 2 x i8> %h, %g
+ ret <vscale x 2 x i8> %i
+}
+
define <vscale x 2 x i32> @vwop_vscale_sext_i8i32_multiple_users(ptr %x, ptr %y, ptr %z) {
; NO_FOLDING-LABEL: vwop_vscale_sext_i8i32_multiple_users:
; NO_FOLDING: # %bb.0:
@@ -434,6 +492,58 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y,
ret <vscale x 2 x i32> %i
}
+define <vscale x 2 x i8> @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; NO_FOLDING-NEXT: vlm.v v0, (a0)
+; NO_FOLDING-NEXT: vlm.v v8, (a1)
+; NO_FOLDING-NEXT: vlm.v v9, (a2)
+; NO_FOLDING-NEXT: vmv.v.i v10, 0
+; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
+; NO_FOLDING-NEXT: vmv1r.v v0, v8
+; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; NO_FOLDING-NEXT: vmv1r.v v0, v9
+; NO_FOLDING-NEXT: vmerge.vim v9, v10, 1, v0
+; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
+; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
+; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v9
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; FOLDING-NEXT: vlm.v v0, (a0)
+; FOLDING-NEXT: vlm.v v8, (a1)
+; FOLDING-NEXT: vlm.v v9, (a2)
+; FOLDING-NEXT: vmv.v.i v10, 0
+; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
+; FOLDING-NEXT: vmv1r.v v0, v8
+; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; FOLDING-NEXT: vmv1r.v v0, v9
+; FOLDING-NEXT: vmerge.vim v9, v10, 1, v0
+; FOLDING-NEXT: vmul.vv v8, v11, v8
+; FOLDING-NEXT: vadd.vv v10, v11, v9
+; FOLDING-NEXT: vsub.vv v9, v11, v9
+; FOLDING-NEXT: vor.vv v8, v8, v10
+; FOLDING-NEXT: vor.vv v8, v8, v9
+; FOLDING-NEXT: ret
+ %a = load <vscale x 2 x i1>, ptr %x
+ %b = load <vscale x 2 x i1>, ptr %y
+ %b2 = load <vscale x 2 x i1>, ptr %z
+ %c = zext <vscale x 2 x i1> %a to <vscale x 2 x i8>
+ %d = zext <vscale x 2 x i1> %b to <vscale x 2 x i8>
+ %d2 = zext <vscale x 2 x i1> %b2 to <vscale x 2 x i8>
+ %e = mul <vscale x 2 x i8> %c, %d
+ %f = add <vscale x 2 x i8> %c, %d2
+ %g = sub <vscale x 2 x i8> %c, %d2
+ %h = or <vscale x 2 x i8> %e, %f
+ %i = or <vscale x 2 x i8> %h, %g
+ ret <vscale x 2 x i8> %i
+}
+
define <vscale x 2 x i32> @vwop_vscale_zext_i8i32_multiple_users(ptr %x, ptr %y, ptr %z) {
; NO_FOLDING-LABEL: vwop_vscale_zext_i8i32_multiple_users:
; NO_FOLDING: # %bb.0:
@@ -479,3 +589,6 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i8i32_multiple_users(ptr %x, ptr %y,
%i = or <vscale x 2 x i32> %h, %g
ret <vscale x 2 x i32> %i
}
+
+
+