[llvm] [RFC][LV] Add support for speculative loads in loops that may fault (PR #151300)
Shih-Po Hung via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 4 01:37:31 PDT 2025
https://github.com/arcbbb updated https://github.com/llvm/llvm-project/pull/151300
>From 52fa48fde1c8981114494321ce428a8871dec06e Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Wed, 12 Mar 2025 01:18:53 -0700
Subject: [PATCH 1/8] Cherry-pick vp.load.ff intrinsic support
---
llvm/docs/LangRef.rst | 57 +
llvm/include/llvm/CodeGen/SelectionDAG.h | 3 +
llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 17 +
llvm/include/llvm/IR/Intrinsics.td | 6 +
llvm/include/llvm/IR/VPIntrinsics.def | 6 +
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 +
.../SelectionDAG/LegalizeVectorTypes.cpp | 74 ++
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 36 +
.../SelectionDAG/SelectionDAGBuilder.cpp | 32 +
.../SelectionDAG/SelectionDAGBuilder.h | 2 +
llvm/lib/IR/IntrinsicInst.cpp | 5 +
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 58 +
llvm/lib/Target/RISCV/RISCVISelLowering.h | 1 +
.../RISCV/rvv/fixed-vectors-vploadff.ll | 586 ++++++++++
llvm/test/CodeGen/RISCV/rvv/vploadff.ll | 1008 +++++++++++++++++
llvm/unittests/IR/VPIntrinsicTest.cpp | 2 +
16 files changed, 1895 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll
create mode 100644 llvm/test/CodeGen/RISCV/rvv/vploadff.ll
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 28746bf9d05aa..8ba985b17a255 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -24243,6 +24243,63 @@ Examples:
%also.r = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %ptr, i32 2, <8 x i1> %mask, <8 x i8> poison)
+.. _int_vp_load_ff:
+
+'``llvm.vp.load.ff``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare {<4 x float>, i32} @llvm.vp.load.ff.v4f32.p0(ptr %ptr, <4 x i1> %mask, i32 %evl)
+ declare {<vscale x 2 x i16>, i32} @llvm.vp.load.ff.nxv2i16.p0(ptr %ptr, <vscale x 2 x i1> %mask, i32 %evl)
+ declare {<8 x float>, i32} @llvm.vp.load.ff.v8f32.p1(ptr addrspace(1) %ptr, <8 x i1> %mask, i32 %evl)
+ declare {<vscale x 1 x i64>, i32} @llvm.vp.load.ff.nxv1i64.p6(ptr addrspace(6) %ptr, <vscale x 1 x i1> %mask, i32 %evl)
+
+Overview:
+"""""""""
+
+The '``llvm.vp.load.ff.*``' intrinsic is similar to '``llvm.vp.load.*``', but
+will not trap if fewer than ``evl`` elements are readable at the pointer.
+
+Arguments:
+""""""""""
+
+The first argument is the base pointer for the load. The second argument is a
+vector of boolean values with the same number of elements as the first return
+type. The third argument is the explicit vector length of the operation. The
+vector loaded from the base pointer has the same type as the first return
+type.
+
+The :ref:`align <attr_align>` parameter attribute can be provided for the first
+argument.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.load.ff``' intrinsic reads a vector from memory similar to
+'``llvm.vp.load``', but will only trap if the first lane is unreadable. The
+number of successfully read lanes is returned in the second return value. The
+result in the first return value for the lanes that were not successfully
+read is a :ref:`poison value <poisonvalues>`. If ``evl`` is 0, no read occurs
+and thus no trap can occur for the first lane. If ``mask`` is 0 for the first
+lane, no trap occurs. This intrinsic is allowed to read fewer than ``evl``
+lanes even if no trap would occur. For example, if ``evl`` is 8 but only the
+first three lanes are readable, the second return value may be 3 and lanes 3
+through 7 of the first return value are poison. If ``evl`` is non-zero, the
+second return value must be at least 1 even if the first lane is disabled by
+``mask``.
+
+The default alignment is taken as the ABI alignment of the first return
+type as specified by the :ref:`datalayout string<langref_datalayout>`.
+
+Examples:
+"""""""""
+
+.. code-block:: text
+
+ %r = call {<8 x i8>, i32} @llvm.vp.load.ff.v8i8.p0(ptr align 2 %ptr, <8 x i1> %mask, i32 %evl)
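+
+The loaded vector and the number of successfully read lanes can then be
+unpacked with ``extractvalue``, for example:
+
+.. code-block:: text
+
+ %vec = extractvalue {<8 x i8>, i32} %r, 0
+ %outvl = extractvalue {<8 x i8>, i32} %r, 1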
+
.. _int_vp_store:
'``llvm.vp.store``' Intrinsic
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index e5644a5ef206a..af895aa84aa89 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1668,6 +1668,9 @@ class SelectionDAG {
ArrayRef<SDValue> Ops,
MachineMemOperand *MMO,
ISD::MemIndexType IndexType);
+ LLVM_ABI SDValue getLoadFFVP(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Mask, SDValue EVL,
+ MachineMemOperand *MMO);
LLVM_ABI SDValue getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr,
EVT MemVT, MachineMemOperand *MMO);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 8f88811be9c01..7fcf66c2c03f1 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -3099,6 +3099,23 @@ class MaskedHistogramSDNode : public MaskedGatherScatterSDNode {
}
};
+class VPLoadFFSDNode : public MemSDNode {
+public:
+ friend class SelectionDAG;
+
+ VPLoadFFSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ : MemSDNode(ISD::VP_LOAD_FF, Order, dl, VTs, MemVT, MMO) {}
+
+ const SDValue &getBasePtr() const { return getOperand(1); }
+ const SDValue &getMask() const { return getOperand(2); }
+ const SDValue &getVectorLength() const { return getOperand(3); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == ISD::VP_LOAD_FF;
+ }
+};
+
class FPStateAccessSDNode : public MemSDNode {
public:
friend class SelectionDAG;
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index bd6f94ac1286c..1a23276bc6ffb 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1932,6 +1932,12 @@ def int_vp_load : DefaultAttrsIntrinsic<[ llvm_anyvector_ty],
llvm_i32_ty],
[ NoCapture<ArgIndex<0>>, IntrReadMem, IntrArgMemOnly ]>;
+def int_vp_load_ff : DefaultAttrsIntrinsic<[ llvm_anyvector_ty, llvm_i32_ty ],
+ [ llvm_anyptr_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty],
+ [ NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>;
+
def int_vp_gather: DefaultAttrsIntrinsic<[ llvm_anyvector_ty],
[ LLVMVectorOfAnyPointersToElt<0>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 55f4719da7c8b..4a71097226f18 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -587,6 +587,12 @@ VP_PROPERTY_FUNCTIONAL_OPC(Load)
VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_load)
END_REGISTER_VP(vp_load, VP_LOAD)
+BEGIN_REGISTER_VP_INTRINSIC(vp_load_ff, 1, 2)
+// val,outvl,chain = VP_LOAD_FF chain,base,mask,evl
+BEGIN_REGISTER_VP_SDNODE(VP_LOAD_FF, -1, vp_load_ff, 2, 3)
+HELPER_MAP_VPID_TO_VPSD(vp_load_ff, VP_LOAD_FF)
+VP_PROPERTY_NO_FUNCTIONAL
+END_REGISTER_VP(vp_load_ff, VP_LOAD_FF)
// llvm.experimental.vp.strided.load(ptr,stride,mask,vlen)
BEGIN_REGISTER_VP_INTRINSIC(experimental_vp_strided_load, 2, 3)
// chain = EXPERIMENTAL_VP_STRIDED_LOAD chain,base,offset,stride,mask,evl
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 2e13b1854bf29..63544e63e1da1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -971,6 +971,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, SDValue &Lo,
SDValue &Hi);
void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi);
@@ -1075,6 +1076,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
SDValue WidenVecRes_LOAD(SDNode* N);
SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N);
+ SDValue WidenVecRes_VP_LOAD_FF(VPLoadFFSDNode *N);
SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N);
SDValue WidenVecRes_VECTOR_COMPRESS(SDNode *N);
SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1661814d5a897..aadda5cd3ba7d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1152,6 +1152,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::VP_LOAD:
SplitVecRes_VP_LOAD(cast<VPLoadSDNode>(N), Lo, Hi);
break;
+ case ISD::VP_LOAD_FF:
+ SplitVecRes_VP_LOAD_FF(cast<VPLoadFFSDNode>(N), Lo, Hi);
+ break;
case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
SplitVecRes_VP_STRIDED_LOAD(cast<VPStridedLoadSDNode>(N), Lo, Hi);
break;
@@ -2227,6 +2230,51 @@ void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo,
ReplaceValueWith(SDValue(LD, 1), Ch);
}
+void DAGTypeLegalizer::SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo,
+ SDValue &Hi) {
+ EVT LoVT, HiVT;
+ SDLoc dl(LD);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
+
+ SDValue Ch = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ Align Alignment = LD->getBaseAlign();
+ SDValue Mask = LD->getMask();
+ SDValue EVL = LD->getVectorLength();
+ EVT MemoryVT = LD->getMemoryVT();
+
+ bool HiIsEmpty = false;
+ auto [LoMemVT, HiMemVT] =
+ DAG.GetDependentSplitDestVTs(MemoryVT, LoVT, &HiIsEmpty);
+
+ // Split Mask operand
+ SDValue MaskLo, MaskHi;
+ if (Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ }
+
+ // Split EVL operand
+ auto [EVLLo, EVLHi] = DAG.SplitEVL(EVL, LD->getValueType(0), dl);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ LD->getPointerInfo(), MachineMemOperand::MOLoad,
+ LocationSize::beforeOrAfterPointer(), Alignment, LD->getAAInfo(),
+ LD->getRanges());
+
+ Lo = DAG.getLoadFFVP(LoVT, dl, Ch, Ptr, MaskLo, EVLLo, MMO);
+
+ // Fill the upper half with poison.
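+  // Only the low half is actually loaded; this is legal because the
+  // intrinsic may read fewer than EVL lanes, and the EVL result of the
+  // low-half load reports how many lanes were actually read.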
+ Hi = DAG.getUNDEF(HiVT);
+
+ ReplaceValueWith(SDValue(LD, 1), Lo.getValue(1));
+ ReplaceValueWith(SDValue(LD, 2), Lo.getValue(2));
+}
+
void DAGTypeLegalizer::SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD,
SDValue &Lo, SDValue &Hi) {
assert(SLD->isUnindexed() &&
@@ -4707,6 +4755,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::VP_LOAD:
Res = WidenVecRes_VP_LOAD(cast<VPLoadSDNode>(N));
break;
+ case ISD::VP_LOAD_FF:
+ Res = WidenVecRes_VP_LOAD_FF(cast<VPLoadFFSDNode>(N));
+ break;
case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
Res = WidenVecRes_VP_STRIDED_LOAD(cast<VPStridedLoadSDNode>(N));
break;
@@ -6163,6 +6214,29 @@ SDValue DAGTypeLegalizer::WidenVecRes_VP_LOAD(VPLoadSDNode *N) {
return Res;
}
+SDValue DAGTypeLegalizer::WidenVecRes_VP_LOAD_FF(VPLoadFFSDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Mask = N->getMask();
+ SDValue EVL = N->getVectorLength();
+ SDLoc dl(N);
+
+ // The mask should be widened as well
+ assert(getTypeAction(Mask.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+         "Unable to widen VP load FF mask");
+ Mask = GetWidenedVector(Mask);
+ assert(Mask.getValueType().getVectorElementCount() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Mask.getValueType())
+ .getVectorElementCount() &&
+ "Unable to widen vector load");
+
+ SDValue Res = DAG.getLoadFFVP(WidenVT, dl, N->getChain(), N->getBasePtr(),
+ Mask, EVL, N->getMemOperand());
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ ReplaceValueWith(SDValue(N, 2), Res.getValue(2));
+ return Res;
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N) {
SDLoc DL(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 5c586f73aa125..8acdb1b93faff 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -837,6 +837,14 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
ID.AddInteger(ELD->getMemOperand()->getFlags());
break;
}
+ case ISD::VP_LOAD_FF: {
+ const VPLoadFFSDNode *LD = cast<VPLoadFFSDNode>(N);
+ ID.AddInteger(LD->getMemoryVT().getRawBits());
+ ID.AddInteger(LD->getRawSubclassData());
+ ID.AddInteger(LD->getPointerInfo().getAddrSpace());
+ ID.AddInteger(LD->getMemOperand()->getFlags());
+ break;
+ }
case ISD::VP_STORE: {
const VPStoreSDNode *EST = cast<VPStoreSDNode>(N);
ID.AddInteger(EST->getMemoryVT().getRawBits());
@@ -10393,6 +10401,34 @@ SDValue SelectionDAG::getMaskedHistogram(SDVTList VTs, EVT MemVT,
return V;
}
+SDValue SelectionDAG::getLoadFFVP(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Mask, SDValue EVL,
+ MachineMemOperand *MMO) {
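+  // Results: the loaded vector, the output VL (with the same type as the
+  // EVL operand), and the chain.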
+ SDVTList VTs = getVTList(VT, EVL.getValueType(), MVT::Other);
+ SDValue Ops[] = {Chain, Ptr, Mask, EVL};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_LOAD_FF, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPLoadFFSDNode>(dl.getIROrder(),
+ VTs, VT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<VPLoadFFSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<VPLoadFFSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ VT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
SDValue SelectionDAG::getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr,
EVT MemVT, MachineMemOperand *MMO) {
assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 306e068f1c1da..719ce43ce4d02 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8440,6 +8440,35 @@ void SelectionDAGBuilder::visitVPLoad(
setValue(&VPIntrin, LD);
}
+void SelectionDAGBuilder::visitVPLoadFF(
+ const VPIntrinsic &VPIntrin, EVT VT, EVT EVLVT,
+ const SmallVectorImpl<SDValue> &OpValues) {
+ assert(OpValues.size() == 3);
+ SDLoc DL = getCurSDLoc();
+ Value *PtrOperand = VPIntrin.getArgOperand(0);
+ MaybeAlign Alignment = VPIntrin.getPointerAlignment();
+ AAMDNodes AAInfo = VPIntrin.getAAMetadata();
+ const MDNode *Ranges = VPIntrin.getMetadata(LLVMContext::MD_range);
+ SDValue LD;
+ bool AddToChain = true;
+ // Do not serialize variable-length loads of constant memory with
+ // anything.
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT);
+ MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo);
+ AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML);
+ SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
+ LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
+ LD = DAG.getLoadFFVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2],
+ MMO);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, EVLVT, LD.getValue(1));
+ if (AddToChain)
+ PendingLoads.push_back(LD.getValue(2));
+ setValue(&VPIntrin, DAG.getMergeValues({LD.getValue(0), Trunc}, DL));
+}
+
void SelectionDAGBuilder::visitVPGather(
const VPIntrinsic &VPIntrin, EVT VT,
const SmallVectorImpl<SDValue> &OpValues) {
@@ -8673,6 +8702,9 @@ void SelectionDAGBuilder::visitVectorPredicationIntrinsic(
case ISD::VP_LOAD:
visitVPLoad(VPIntrin, ValueVTs[0], OpValues);
break;
+ case ISD::VP_LOAD_FF:
+ visitVPLoadFF(VPIntrin, ValueVTs[0], ValueVTs[1], OpValues);
+ break;
case ISD::VP_GATHER:
visitVPGather(VPIntrin, ValueVTs[0], OpValues);
break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 1c278076a219d..c251755ee7064 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -631,6 +631,8 @@ class SelectionDAGBuilder {
void visitVectorExtractLastActive(const CallInst &I, unsigned Intrinsic);
void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT,
const SmallVectorImpl<SDValue> &OpValues);
+ void visitVPLoadFF(const VPIntrinsic &VPIntrin, EVT VT, EVT EVLVT,
+ const SmallVectorImpl<SDValue> &OpValues);
void visitVPStore(const VPIntrinsic &VPIntrin,
const SmallVectorImpl<SDValue> &OpValues);
void visitVPGather(const VPIntrinsic &VPIntrin, EVT VT,
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index b1d3339c5a414..23a4d1b5c615e 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -448,6 +448,7 @@ VPIntrinsic::getMemoryPointerParamPos(Intrinsic::ID VPID) {
case Intrinsic::experimental_vp_strided_store:
return 1;
case Intrinsic::vp_load:
+ case Intrinsic::vp_load_ff:
case Intrinsic::vp_gather:
case Intrinsic::experimental_vp_strided_load:
return 0;
@@ -671,6 +672,10 @@ Function *VPIntrinsic::getOrInsertDeclarationForParams(
VPFunc = Intrinsic::getOrInsertDeclaration(
M, VPID, {ReturnType, Params[0]->getType()});
break;
+ case Intrinsic::vp_load_ff:
+ VPFunc = Intrinsic::getOrInsertDeclaration(
+ M, VPID, {ReturnType->getStructElementType(0), Params[0]->getType()});
+ break;
case Intrinsic::experimental_vp_strided_load:
VPFunc = Intrinsic::getOrInsertDeclaration(
M, VPID, {ReturnType, Params[0]->getType(), Params[1]->getType()});
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index b47d89b42f533..5c87ceb2ffd72 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -927,6 +927,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
{ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER},
VT, Custom);
+ setOperationAction(ISD::VP_LOAD_FF, VT, Custom);
setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR,
ISD::EXTRACT_SUBVECTOR, ISD::SCALAR_TO_VECTOR},
@@ -1105,6 +1106,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
{ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER},
VT, Custom);
+ setOperationAction(ISD::VP_LOAD_FF, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -1181,6 +1183,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER,
ISD::VP_SCATTER},
VT, Custom);
+ setOperationAction(ISD::VP_LOAD_FF, VT, Custom);
setOperationAction(ISD::FNEG, VT, Expand);
setOperationAction(ISD::FABS, VT, Expand);
@@ -1352,6 +1355,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER,
ISD::VP_SCATTER},
VT, Custom);
+ setOperationAction(ISD::VP_LOAD_FF, VT, Custom);
setOperationAction({ISD::ADD, ISD::MUL, ISD::SUB, ISD::AND, ISD::OR,
ISD::XOR, ISD::SDIV, ISD::SREM, ISD::UDIV,
@@ -1442,6 +1446,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_SCATTER, ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
ISD::EXPERIMENTAL_VP_STRIDED_STORE},
VT, Custom);
+ setOperationAction(ISD::VP_LOAD_FF, VT, Custom);
setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT,
@@ -8096,6 +8101,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::MLOAD:
case ISD::VP_LOAD:
return lowerMaskedLoad(Op, DAG);
+ case ISD::VP_LOAD_FF:
+ return lowerLoadFF(Op, DAG);
case ISD::MSTORE:
case ISD::VP_STORE:
return lowerMaskedStore(Op, DAG);
@@ -12682,6 +12689,57 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
return DAG.getMergeValues({Result, Chain}, DL);
}
+SDValue RISCVTargetLowering::lowerLoadFF(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getResNo() == 0);
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ const auto *VPLoadFF = cast<VPLoadFFSDNode>(Op);
+ EVT MemVT = VPLoadFF->getMemoryVT();
+ MachineMemOperand *MMO = VPLoadFF->getMemOperand();
+ SDValue Chain = VPLoadFF->getChain();
+ SDValue BasePtr = VPLoadFF->getBasePtr();
+
+ SDValue Mask = VPLoadFF->getMask();
+ SDValue VL = VPLoadFF->getVectorLength();
+
+ bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ MVT ContainerVT = VT;
+ if (VT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VT);
+ if (!IsUnmasked) {
+ MVT MaskVT = getMaskTypeFor(ContainerVT);
+ Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+ }
+ }
+
+ unsigned IntID =
+ IsUnmasked ? Intrinsic::riscv_vleff : Intrinsic::riscv_vleff_mask;
+ SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
+ Ops.push_back(DAG.getUNDEF(ContainerVT));
+ Ops.push_back(BasePtr);
+ if (!IsUnmasked)
+ Ops.push_back(Mask);
+ Ops.push_back(VL);
+ if (!IsUnmasked)
+ Ops.push_back(DAG.getTargetConstant(RISCVVType::TAIL_AGNOSTIC, DL, XLenVT));
+
+ SDVTList VTs = DAG.getVTList({ContainerVT, Op->getValueType(1), MVT::Other});
+
+ SDValue Result =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MemVT, MMO);
+ SDValue OutVL = Result.getValue(1);
+ Chain = Result.getValue(2);
+
+ if (VT.isFixedLengthVector())
+ Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
+
+ return DAG.getMergeValues({Result, OutVL, Chain}, DL);
+}
+
SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ca70c46988b4e..a4c353478a649 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -526,6 +526,7 @@ class RISCVTargetLowering : public TargetLowering {
SDValue lowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerMaskedLoad(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerLoadFF(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerMaskedStore(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVectorCompress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op,
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll
new file mode 100644
index 0000000000000..5b01976dbbebd
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll
@@ -0,0 +1,586 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+
+define { <2 x i8>, i32 } @vploadff_v2i8(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i8>, i32 } @llvm.vp.load.ff.v2i8.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x i8>, i32 } %load
+}
+
+define { <2 x i8>, i32 } @vploadff_v2i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i8>, i32 } @llvm.vp.load.ff.v2i8.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x i8>, i32 } %load
+}
+
+define { <4 x i8>, i32 } @vploadff_v4i8(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i8>, i32 } @llvm.vp.load.ff.v4i8.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x i8>, i32 } %load
+}
+
+define { <4 x i8>, i32 } @vploadff_v4i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i8>, i32 } @llvm.vp.load.ff.v4i8.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x i8>, i32 } %load
+}
+
+define { <8 x i8>, i32 } @vploadff_v8i8(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i8>, i32 } @llvm.vp.load.ff.v8i8.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x i8>, i32 } %load
+}
+
+define { <8 x i8>, i32 } @vploadff_v8i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i8>, i32 } @llvm.vp.load.ff.v8i8.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x i8>, i32 } %load
+}
+
+define { <2 x i16>, i32 } @vploadff_v2i16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i16>, i32 } @llvm.vp.load.ff.v2i16.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x i16>, i32 } %load
+}
+
+define { <2 x i16>, i32 } @vploadff_v2i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i16>, i32 } @llvm.vp.load.ff.v2i16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x i16>, i32 } %load
+}
+
+define { <4 x i16>, i32 } @vploadff_v4i16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i16>, i32 } @llvm.vp.load.ff.v4i16.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x i16>, i32 } %load
+}
+
+define { <4 x i16>, i32 } @vploadff_v4i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i16>, i32 } @llvm.vp.load.ff.v4i16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x i16>, i32 } %load
+}
+
+define { <8 x i16>, i32 } @vploadff_v8i16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i16>, i32 } @llvm.vp.load.ff.v8i16.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x i16>, i32 } %load
+}
+
+define { <8 x i16>, i32 } @vploadff_v8i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i16>, i32 } @llvm.vp.load.ff.v8i16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x i16>, i32 } %load
+}
+
+define { <2 x i32>, i32 } @vploadff_v2i32(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i32>, i32 } @llvm.vp.load.ff.v2i32.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x i32>, i32 } %load
+}
+
+define { <2 x i32>, i32 } @vploadff_v2i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i32>, i32 } @llvm.vp.load.ff.v2i32.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x i32>, i32 } %load
+}
+
+define { <4 x i32>, i32 } @vploadff_v4i32(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i32>, i32 } @llvm.vp.load.ff.v4i32.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x i32>, i32 } %load
+}
+
+define { <4 x i32>, i32 } @vploadff_v4i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i32>, i32 } @llvm.vp.load.ff.v4i32.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x i32>, i32 } %load
+}
+
+define { <8 x i32>, i32 } @vploadff_v8i32(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i32>, i32 } @llvm.vp.load.ff.v8i32.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x i32>, i32 } %load
+}
+
+define { <8 x i32>, i32 } @vploadff_v8i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i32>, i32 } @llvm.vp.load.ff.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x i32>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vploadff_v2i64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x i64>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vploadff_v2i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x i64>, i32 } %load
+}
+
+define { <4 x i64>, i32 } @vploadff_v4i64(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i64>, i32 } @llvm.vp.load.ff.v4i64.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x i64>, i32 } %load
+}
+
+define { <4 x i64>, i32 } @vploadff_v4i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i64>, i32 } @llvm.vp.load.ff.v4i64.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x i64>, i32 } %load
+}
+
+define { <8 x i64>, i32 } @vploadff_v8i64(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i64>, i32 } @llvm.vp.load.ff.v8i64.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x i64>, i32 } %load
+}
+
+define { <8 x i64>, i32 } @vploadff_v8i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i64>, i32 } @llvm.vp.load.ff.v8i64.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x i64>, i32 } %load
+}
+
+define { <32 x i64>, i32 } @vploadff_v32i64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v32i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: bltu a2, a3, .LBB24_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: .LBB24_2:
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a1), v0.t
+; CHECK-NEXT: csrr a1, vl
+; CHECK-NEXT: sw a1, 256(a0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: ret
+ %load = call { <32 x i64>, i32 } @llvm.vp.load.ff.v32i64.p0(ptr %ptr, <32 x i1> %m, i32 %evl)
+ ret { <32 x i64>, i32 } %load
+}
+
+define { <32 x i64>, i32 } @vploadff_v32i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v32i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: bltu a2, a3, .LBB25_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: .LBB25_2:
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a1)
+; CHECK-NEXT: csrr a1, vl
+; CHECK-NEXT: sw a1, 256(a0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: ret
+ %load = call { <32 x i64>, i32 } @llvm.vp.load.ff.v32i64.p0(ptr %ptr, <32 x i1> splat (i1 true), i32 %evl)
+ ret { <32 x i64>, i32 } %load
+}
+
+define { <2 x half>, i32 } @vploadff_v2f16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x half>, i32 } @llvm.vp.load.ff.v2f16.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x half>, i32 } %load
+}
+
+define { <2 x half>, i32 } @vploadff_v2f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x half>, i32 } @llvm.vp.load.ff.v2f16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x half>, i32 } %load
+}
+
+define { <4 x half>, i32 } @vploadff_v4f16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x half>, i32 } @llvm.vp.load.ff.v4f16.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x half>, i32 } %load
+}
+
+define { <4 x half>, i32 } @vploadff_v4f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x half>, i32 } @llvm.vp.load.ff.v4f16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x half>, i32 } %load
+}
+
+define { <8 x half>, i32 } @vploadff_v8f16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x half>, i32 } @llvm.vp.load.ff.v8f16.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x half>, i32 } %load
+}
+
+define { <8 x half>, i32 } @vploadff_v8f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x half>, i32 } @llvm.vp.load.ff.v8f16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x half>, i32 } %load
+}
+
+define { <2 x float>, i32 } @vploadff_v2f32(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x float>, i32 } @llvm.vp.load.ff.v2f32.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x float>, i32 } %load
+}
+
+define { <2 x float>, i32 } @vploadff_v2f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x float>, i32 } @llvm.vp.load.ff.v2f32.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x float>, i32 } %load
+}
+
+define { <4 x float>, i32 } @vploadff_v4f32(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x float>, i32 } @llvm.vp.load.ff.v4f32.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x float>, i32 } %load
+}
+
+define { <4 x float>, i32 } @vploadff_v4f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x float>, i32 } @llvm.vp.load.ff.v4f32.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x float>, i32 } %load
+}
+
+define { <8 x float>, i32 } @vploadff_v8f32(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x float>, i32 } @llvm.vp.load.ff.v8f32.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x float>, i32 } %load
+}
+
+define { <8 x float>, i32 } @vploadff_v8f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x float>, i32 } @llvm.vp.load.ff.v8f32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x float>, i32 } %load
+}
+
+define { <2 x double>, i32 } @vploadff_v2f64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x double>, i32 } @llvm.vp.load.ff.v2f64.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x double>, i32 } %load
+}
+
+define { <2 x double>, i32 } @vploadff_v2f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x double>, i32 } @llvm.vp.load.ff.v2f64.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x double>, i32 } %load
+}
+
+define { <4 x double>, i32 } @vploadff_v4f64(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x double>, i32 } @llvm.vp.load.ff.v4f64.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x double>, i32 } %load
+}
+
+define { <4 x double>, i32 } @vploadff_v4f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x double>, i32 } @llvm.vp.load.ff.v4f64.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x double>, i32 } %load
+}
+
+define { <8 x double>, i32 } @vploadff_v8f64(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x double>, i32 } @llvm.vp.load.ff.v8f64.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x double>, i32 } %load
+}
+
+define { <8 x double>, i32 } @vploadff_v8f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x double>, i32 } @llvm.vp.load.ff.v8f64.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x double>, i32 } %load
+}
+
+define { <2 x bfloat>, i32 } @vploadff_v2bf16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x bfloat>, i32 } @llvm.vp.load.ff.v2bf16.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x bfloat>, i32 } %load
+}
+
+define { <2 x bfloat>, i32 } @vploadff_v2bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x bfloat>, i32 } @llvm.vp.load.ff.v2bf16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x bfloat>, i32 } %load
+}
+
+define { <4 x bfloat>, i32 } @vploadff_v4bf16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x bfloat>, i32 } @llvm.vp.load.ff.v4bf16.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x bfloat>, i32 } %load
+}
+
+define { <4 x bfloat>, i32 } @vploadff_v4bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x bfloat>, i32 } @llvm.vp.load.ff.v4bf16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x bfloat>, i32 } %load
+}
+
+define { <8 x bfloat>, i32 } @vploadff_v8bf16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x bfloat>, i32 } @llvm.vp.load.ff.v8bf16.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x bfloat>, i32 } %load
+}
+
+define { <8 x bfloat>, i32 } @vploadff_v8bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x bfloat>, i32 } @llvm.vp.load.ff.v8bf16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x bfloat>, i32 } %load
+}
+
+define { <7 x i8>, i32 } @vploadff_v7i8(ptr %ptr, <7 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v7i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <7 x i8>, i32 } @llvm.vp.load.ff.v7i8.p0(ptr %ptr, <7 x i1> %m, i32 %evl)
+ ret { <7 x i8>, i32 } %load
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vploadff.ll b/llvm/test/CodeGen/RISCV/rvv/vploadff.ll
new file mode 100644
index 0000000000000..9e08938a9fe6c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vploadff.ll
@@ -0,0 +1,1008 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+
+define { <vscale x 1 x i8>, i32 } @vploadff_nxv1i8(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i8>, i32 } @llvm.vp.load.ff.nxv1i8.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x i8>, i32 } %load
+}
+
+define { <vscale x 1 x i8>, i32 } @vploadff_nxv1i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i8>, i32 } @llvm.vp.load.ff.nxv1i8.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x i8>, i32 } %load
+}
+
+define { <vscale x 2 x i8>, i32 } @vploadff_nxv2i8(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i8>, i32 } @llvm.vp.load.ff.nxv2i8.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x i8>, i32 } %load
+}
+
+define { <vscale x 2 x i8>, i32 } @vploadff_nxv2i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i8>, i32 } @llvm.vp.load.ff.nxv2i8.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x i8>, i32 } %load
+}
+
+define { <vscale x 4 x i8>, i32 } @vploadff_nxv4i8(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i8>, i32 } @llvm.vp.load.ff.nxv4i8.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x i8>, i32 } %load
+}
+
+define { <vscale x 4 x i8>, i32 } @vploadff_nxv4i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i8>, i32 } @llvm.vp.load.ff.nxv4i8.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x i8>, i32 } %load
+}
+
+define { <vscale x 8 x i8>, i32 } @vploadff_nxv8i8(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i8>, i32 } @llvm.vp.load.ff.nxv8i8.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x i8>, i32 } %load
+}
+
+define { <vscale x 8 x i8>, i32 } @vploadff_nxv8i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i8>, i32 } @llvm.vp.load.ff.nxv8i8.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x i8>, i32 } %load
+}
+
+define { <vscale x 16 x i8>, i32 } @vploadff_nxv16i8(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i8>, i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x i8>, i32 } %load
+}
+
+define { <vscale x 16 x i8>, i32 } @vploadff_nxv16i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i8>, i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x i8>, i32 } %load
+}
+
+define { <vscale x 32 x i8>, i32 } @vploadff_nxv32i8(ptr %ptr, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x i8>, i32 } @llvm.vp.load.ff.nxv32i8.p0(ptr %ptr, <vscale x 32 x i1> %m, i32 %evl)
+ ret { <vscale x 32 x i8>, i32 } %load
+}
+
+define { <vscale x 32 x i8>, i32 } @vploadff_nxv32i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x i8>, i32 } @llvm.vp.load.ff.nxv32i8.p0(ptr %ptr, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 32 x i8>, i32 } %load
+}
+
+define { <vscale x 64 x i8>, i32 } @vploadff_nxv64i8(ptr %ptr, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 64 x i8>, i32 } @llvm.vp.load.ff.nxv64i8.p0(ptr %ptr, <vscale x 64 x i1> %m, i32 %evl)
+ ret { <vscale x 64 x i8>, i32 } %load
+}
+
+define { <vscale x 64 x i8>, i32 } @vploadff_nxv64i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv64i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 64 x i8>, i32 } @llvm.vp.load.ff.nxv64i8.p0(ptr %ptr, <vscale x 64 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 64 x i8>, i32 } %load
+}
+
+define <vscale x 128 x i8> @vploadff_nxv128i8(ptr %ptr, ptr %evl_out, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv128i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: bltu a2, a3, .LBB14_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB14_2:
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: sw a0, 0(a1)
+; CHECK-NEXT: ret
+ %load = call { <vscale x 128 x i8>, i32 } @llvm.vp.load.ff.nxv128i8.p0(ptr %ptr, <vscale x 128 x i1> %m, i32 %evl)
+ %result0 = extractvalue { <vscale x 128 x i8>, i32 } %load, 0
+ %result1 = extractvalue { <vscale x 128 x i8>, i32 } %load, 1
+ store i32 %result1, ptr %evl_out
+ ret <vscale x 128 x i8> %result0
+}
+
+define <vscale x 128 x i8> @vploadff_nxv128i8_allones_mask(ptr %ptr, ptr %evl_out, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv128i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: bltu a2, a3, .LBB15_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB15_2:
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: sw a0, 0(a1)
+; CHECK-NEXT: ret
+ %load = call { <vscale x 128 x i8>, i32 } @llvm.vp.load.ff.nxv128i8.p0(ptr %ptr, <vscale x 128 x i1> splat (i1 true), i32 %evl)
+ %result0 = extractvalue { <vscale x 128 x i8>, i32 } %load, 0
+ %result1 = extractvalue { <vscale x 128 x i8>, i32 } %load, 1
+ store i32 %result1, ptr %evl_out
+ ret <vscale x 128 x i8> %result0
+}
+
+define { <vscale x 1 x i16>, i32 } @vploadff_nxv1i16(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i16>, i32 } @llvm.vp.load.ff.nxv1i16.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x i16>, i32 } %load
+}
+
+define { <vscale x 1 x i16>, i32 } @vploadff_nxv1i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i16>, i32 } @llvm.vp.load.ff.nxv1i16.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x i16>, i32 } %load
+}
+
+define { <vscale x 2 x i16>, i32 } @vploadff_nxv2i16(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i16>, i32 } @llvm.vp.load.ff.nxv2i16.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x i16>, i32 } %load
+}
+
+define { <vscale x 2 x i16>, i32 } @vploadff_nxv2i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i16>, i32 } @llvm.vp.load.ff.nxv2i16.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x i16>, i32 } %load
+}
+
+define { <vscale x 4 x i16>, i32 } @vploadff_nxv4i16(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i16>, i32 } @llvm.vp.load.ff.nxv4i16.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x i16>, i32 } %load
+}
+
+define { <vscale x 4 x i16>, i32 } @vploadff_nxv4i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i16>, i32 } @llvm.vp.load.ff.nxv4i16.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x i16>, i32 } %load
+}
+
+define { <vscale x 8 x i16>, i32 } @vploadff_nxv8i16(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i16>, i32 } @llvm.vp.load.ff.nxv8i16.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x i16>, i32 } %load
+}
+
+define { <vscale x 8 x i16>, i32 } @vploadff_nxv8i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i16>, i32 } @llvm.vp.load.ff.nxv8i16.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x i16>, i32 } %load
+}
+
+define { <vscale x 16 x i16>, i32 } @vploadff_nxv16i16(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i16>, i32 } @llvm.vp.load.ff.nxv16i16.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x i16>, i32 } %load
+}
+
+define { <vscale x 16 x i16>, i32 } @vploadff_nxv16i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i16>, i32 } @llvm.vp.load.ff.nxv16i16.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x i16>, i32 } %load
+}
+
+define { <vscale x 32 x i16>, i32 } @vploadff_nxv32i16(ptr %ptr, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x i16>, i32 } @llvm.vp.load.ff.nxv32i16.p0(ptr %ptr, <vscale x 32 x i1> %m, i32 %evl)
+ ret { <vscale x 32 x i16>, i32 } %load
+}
+
+define { <vscale x 32 x i16>, i32 } @vploadff_nxv32i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x i16>, i32 } @llvm.vp.load.ff.nxv32i16.p0(ptr %ptr, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 32 x i16>, i32 } %load
+}
+
+define { <vscale x 1 x i32>, i32 } @vploadff_nxv1i32(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i32>, i32 } @llvm.vp.load.ff.nxv1i32.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x i32>, i32 } %load
+}
+
+define { <vscale x 1 x i32>, i32 } @vploadff_nxv1i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i32>, i32 } @llvm.vp.load.ff.nxv1i32.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x i32>, i32 } %load
+}
+
+define { <vscale x 2 x i32>, i32 } @vploadff_nxv2i32(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i32>, i32 } @llvm.vp.load.ff.nxv2i32.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x i32>, i32 } %load
+}
+
+define { <vscale x 2 x i32>, i32 } @vploadff_nxv2i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i32>, i32 } @llvm.vp.load.ff.nxv2i32.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x i32>, i32 } %load
+}
+
+define { <vscale x 4 x i32>, i32 } @vploadff_nxv4i32(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x i32>, i32 } %load
+}
+
+define { <vscale x 4 x i32>, i32 } @vploadff_nxv4i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x i32>, i32 } %load
+}
+
+define { <vscale x 8 x i32>, i32 } @vploadff_nxv8i32(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i32>, i32 } @llvm.vp.load.ff.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x i32>, i32 } %load
+}
+
+define { <vscale x 8 x i32>, i32 } @vploadff_nxv8i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i32>, i32 } @llvm.vp.load.ff.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x i32>, i32 } %load
+}
+
+define { <vscale x 16 x i32>, i32 } @vploadff_nxv16i32(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i32>, i32 } @llvm.vp.load.ff.nxv16i32.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x i32>, i32 } %load
+}
+
+define { <vscale x 16 x i32>, i32 } @vploadff_nxv16i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i32>, i32 } @llvm.vp.load.ff.nxv16i32.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x i32>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vploadff_nxv1i64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vploadff_nxv1i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 2 x i64>, i32 } @vploadff_nxv2i64(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i64>, i32 } @llvm.vp.load.ff.nxv2i64.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x i64>, i32 } %load
+}
+
+define { <vscale x 2 x i64>, i32 } @vploadff_nxv2i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i64>, i32 } @llvm.vp.load.ff.nxv2i64.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x i64>, i32 } %load
+}
+
+define { <vscale x 4 x i64>, i32 } @vploadff_nxv4i64(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i64>, i32 } @llvm.vp.load.ff.nxv4i64.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x i64>, i32 } %load
+}
+
+define { <vscale x 4 x i64>, i32 } @vploadff_nxv4i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i64>, i32 } @llvm.vp.load.ff.nxv4i64.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x i64>, i32 } %load
+}
+
+define { <vscale x 8 x i64>, i32 } @vploadff_nxv8i64(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i64>, i32 } @llvm.vp.load.ff.nxv8i64.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x i64>, i32 } %load
+}
+
+define { <vscale x 8 x i64>, i32 } @vploadff_nxv8i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i64>, i32 } @llvm.vp.load.ff.nxv8i64.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x half>, i32 } @vploadff_nxv1f16(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x half>, i32 } @llvm.vp.load.ff.nxv1f16.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x half>, i32 } %load
+}
+
+define { <vscale x 1 x half>, i32 } @vploadff_nxv1f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x half>, i32 } @llvm.vp.load.ff.nxv1f16.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x half>, i32 } %load
+}
+
+define { <vscale x 2 x half>, i32 } @vploadff_nxv2f16(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x half>, i32 } @llvm.vp.load.ff.nxv2f16.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x half>, i32 } %load
+}
+
+define { <vscale x 2 x half>, i32 } @vploadff_nxv2f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x half>, i32 } @llvm.vp.load.ff.nxv2f16.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x half>, i32 } %load
+}
+
+define { <vscale x 4 x half>, i32 } @vploadff_nxv4f16(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x half>, i32 } @llvm.vp.load.ff.nxv4f16.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x half>, i32 } %load
+}
+
+define { <vscale x 4 x half>, i32 } @vploadff_nxv4f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x half>, i32 } @llvm.vp.load.ff.nxv4f16.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x half>, i32 } %load
+}
+
+define { <vscale x 8 x half>, i32 } @vploadff_nxv8f16(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x half>, i32 } @llvm.vp.load.ff.nxv8f16.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x half>, i32 } %load
+}
+
+define { <vscale x 8 x half>, i32 } @vploadff_nxv8f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x half>, i32 } @llvm.vp.load.ff.nxv8f16.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x half>, i32 } %load
+}
+
+define { <vscale x 16 x half>, i32 } @vploadff_nxv16f16(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x half>, i32 } @llvm.vp.load.ff.nxv16f16.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x half>, i32 } %load
+}
+
+define { <vscale x 16 x half>, i32 } @vploadff_nxv16f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x half>, i32 } @llvm.vp.load.ff.nxv16f16.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x half>, i32 } %load
+}
+
+define { <vscale x 32 x half>, i32 } @vploadff_nxv32f16(ptr %ptr, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x half>, i32 } @llvm.vp.load.ff.nxv32f16.p0(ptr %ptr, <vscale x 32 x i1> %m, i32 %evl)
+ ret { <vscale x 32 x half>, i32 } %load
+}
+
+define { <vscale x 32 x half>, i32 } @vploadff_nxv32f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x half>, i32 } @llvm.vp.load.ff.nxv32f16.p0(ptr %ptr, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 32 x half>, i32 } %load
+}
+
+define { <vscale x 1 x float>, i32 } @vploadff_nxv1f32(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x float>, i32 } @llvm.vp.load.ff.nxv1f32.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x float>, i32 } %load
+}
+
+define { <vscale x 1 x float>, i32 } @vploadff_nxv1f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x float>, i32 } @llvm.vp.load.ff.nxv1f32.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x float>, i32 } %load
+}
+
+define { <vscale x 2 x float>, i32 } @vploadff_nxv2f32(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x float>, i32 } @llvm.vp.load.ff.nxv2f32.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x float>, i32 } %load
+}
+
+define { <vscale x 2 x float>, i32 } @vploadff_nxv2f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x float>, i32 } @llvm.vp.load.ff.nxv2f32.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x float>, i32 } %load
+}
+
+define { <vscale x 4 x float>, i32 } @vploadff_nxv4f32(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x float>, i32 } @llvm.vp.load.ff.nxv4f32.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x float>, i32 } %load
+}
+
+define { <vscale x 4 x float>, i32 } @vploadff_nxv4f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x float>, i32 } @llvm.vp.load.ff.nxv4f32.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x float>, i32 } %load
+}
+
+define { <vscale x 8 x float>, i32 } @vploadff_nxv8f32(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x float>, i32 } @llvm.vp.load.ff.nxv8f32.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x float>, i32 } %load
+}
+
+define { <vscale x 8 x float>, i32 } @vploadff_nxv8f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x float>, i32 } @llvm.vp.load.ff.nxv8f32.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x float>, i32 } %load
+}
+
+define { <vscale x 16 x float>, i32 } @vploadff_nxv16f32(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x float>, i32 } @llvm.vp.load.ff.nxv16f32.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x float>, i32 } %load
+}
+
+define { <vscale x 16 x float>, i32 } @vploadff_nxv16f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x float>, i32 } @llvm.vp.load.ff.nxv16f32.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x float>, i32 } %load
+}
+
+define { <vscale x 1 x double>, i32 } @vploadff_nxv1f64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x double>, i32 } @llvm.vp.load.ff.nxv1f64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x double>, i32 } %load
+}
+
+define { <vscale x 1 x double>, i32 } @vploadff_nxv1f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x double>, i32 } @llvm.vp.load.ff.nxv1f64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x double>, i32 } %load
+}
+
+define { <vscale x 2 x double>, i32 } @vploadff_nxv2f64(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x double>, i32 } @llvm.vp.load.ff.nxv2f64.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x double>, i32 } %load
+}
+
+define { <vscale x 2 x double>, i32 } @vploadff_nxv2f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x double>, i32 } @llvm.vp.load.ff.nxv2f64.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x double>, i32 } %load
+}
+
+define { <vscale x 4 x double>, i32 } @vploadff_nxv4f64(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x double>, i32 } @llvm.vp.load.ff.nxv4f64.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x double>, i32 } %load
+}
+
+define { <vscale x 4 x double>, i32 } @vploadff_nxv4f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x double>, i32 } @llvm.vp.load.ff.nxv4f64.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x double>, i32 } %load
+}
+
+define { <vscale x 8 x double>, i32 } @vploadff_nxv8f64(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x double>, i32 } @llvm.vp.load.ff.nxv8f64.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x double>, i32 } %load
+}
+
+define { <vscale x 8 x double>, i32 } @vploadff_nxv8f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x double>, i32 } @llvm.vp.load.ff.nxv8f64.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x double>, i32 } %load
+}
+
+define { <vscale x 1 x bfloat>, i32 } @vploadff_nxv1bf16(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x bfloat>, i32 } @llvm.vp.load.ff.nxv1bf16.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x bfloat>, i32 } %load
+}
+
+define { <vscale x 1 x bfloat>, i32 } @vploadff_nxv1bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x bfloat>, i32 } @llvm.vp.load.ff.nxv1bf16.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x bfloat>, i32 } %load
+}
+
+define { <vscale x 2 x bfloat>, i32 } @vploadff_nxv2bf16(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x bfloat>, i32 } @llvm.vp.load.ff.nxv2bf16.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x bfloat>, i32 } %load
+}
+
+define { <vscale x 2 x bfloat>, i32 } @vploadff_nxv2bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x bfloat>, i32 } @llvm.vp.load.ff.nxv2bf16.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x bfloat>, i32 } %load
+}
+
+define { <vscale x 4 x bfloat>, i32 } @vploadff_nxv4bf16(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x bfloat>, i32 } @llvm.vp.load.ff.nxv4bf16.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x bfloat>, i32 } %load
+}
+
+define { <vscale x 4 x bfloat>, i32 } @vploadff_nxv4bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x bfloat>, i32 } @llvm.vp.load.ff.nxv4bf16.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x bfloat>, i32 } %load
+}
+
+define { <vscale x 8 x bfloat>, i32 } @vploadff_nxv8bf16(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x bfloat>, i32 } @llvm.vp.load.ff.nxv8bf16.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x bfloat>, i32 } %load
+}
+
+define { <vscale x 8 x bfloat>, i32 } @vploadff_nxv8bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x bfloat>, i32 } @llvm.vp.load.ff.nxv8bf16.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x bfloat>, i32 } %load
+}
+
+define { <vscale x 16 x bfloat>, i32 } @vploadff_nxv16bf16(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x bfloat>, i32 } @llvm.vp.load.ff.nxv16bf16.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x bfloat>, i32 } %load
+}
+
+define { <vscale x 16 x bfloat>, i32 } @vploadff_nxv16bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x bfloat>, i32 } @llvm.vp.load.ff.nxv16bf16.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x bfloat>, i32 } %load
+}
+
+define { <vscale x 32 x bfloat>, i32 } @vploadff_nxv32bf16(ptr %ptr, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x bfloat>, i32 } @llvm.vp.load.ff.nxv32bf16.p0(ptr %ptr, <vscale x 32 x i1> %m, i32 %evl)
+ ret { <vscale x 32 x bfloat>, i32 } %load
+}
+
+define { <vscale x 32 x bfloat>, i32 } @vploadff_nxv32bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x bfloat>, i32 } @llvm.vp.load.ff.nxv32bf16.p0(ptr %ptr, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 32 x bfloat>, i32 } %load
+}
+
+define { <vscale x 3 x i8>, i32 } @vploadff_nxv3i8(ptr %ptr, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 3 x i8>, i32 } @llvm.vp.load.ff.nxv3i8.p0(ptr %ptr, <vscale x 3 x i1> %m, i32 %evl)
+ ret { <vscale x 3 x i8>, i32 } %load
+}
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
index d6ad7599ce461..a101979ee6a4a 100644
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -100,6 +100,8 @@ class VPIntrinsicTest : public testing::Test {
"i32*>, <8 x i1>, i32) ";
Str << " declare <8 x i32> @llvm.vp.load.v8i32.p0v8i32(<8 x i32>*, <8 x "
"i1>, i32) ";
+ Str << " declare {<8 x i32>, i32} @llvm.vp.load.ff.v8i32.p0v8i32(<8 x "
+ "i32>*, <8 x i1>, i32) ";
Str << "declare <8 x i32> "
"@llvm.experimental.vp.strided.load.v8i32.i32(i32*, i32, <8 "
"x i1>, i32) ";
From 5550dda5a8dc4725c34c6026725c41a438dbe240 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Fri, 25 Jul 2025 16:24:16 -0700
Subject: [PATCH 2/8] Enable early-exit loops with EVL
---
.../Vectorize/LoopVectorizationLegality.cpp | 10 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 5 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 25 +++++
...ectorize-force-tail-with-evl-early-exit.ll | 95 +++++++++++++++++++
4 files changed, 127 insertions(+), 8 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 969d225c6ef2e..cfcf68d445992 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1912,10 +1912,12 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
bool LoopVectorizationLegality::canFoldTailByMasking() const {
// The only loops we can vectorize without a scalar epilogue, are loops with
- // a bottom-test and a single exiting block. We'd have to handle the fact
- // that not every instruction executes on the last iteration. This will
- // require a lane mask which varies through the vector loop body. (TODO)
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ // a bottom-test and a single exiting block, or loops with early exits. We'd
+ // have to handle the fact that not every instruction executes on the last
+ // iteration. This will require a lane mask which varies through the vector
+ // loop body. (TODO)
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
+ !hasUncountableEarlyExit()) {
LLVM_DEBUG(
dbgs()
<< "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fe93fcd28348a..a20133ee8f3ae 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8677,10 +8677,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// count is >= increment and a multiple of the increment.
bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
if (!HasNUW) {
- auto *IVInc = Plan->getVectorLoopRegion()
- ->getExitingBasicBlock()
- ->getTerminator()
- ->getOperand(0);
+ auto *IVInc = Plan->getCanonicalIV()->getBackedgeValue();
assert(match(IVInc, m_VPInstruction<Instruction::Add>(
m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
"Did not find the canonical IV increment");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 47a9ff09352cb..e8f07610604cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2047,6 +2047,16 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
// Replace the original terminator with BranchOnCond. We have to invert the
// mask here because a true condition means jumping to the exit block.
auto *NotMask = Builder.createNot(ALM, DL);
+ using namespace VPlanPatternMatch;
+ if (VPValue *IsEarlyExitTaken = nullptr; match(
+ OriginalTerminator, m_BranchOnCond(m_BinaryOr(
+ m_VPValue(IsEarlyExitTaken), m_VPValue())))) {
+ auto *AnyExitTaken =
+ Builder.createNaryOp(Instruction::Or, {IsEarlyExitTaken, NotMask});
+ OriginalTerminator->setOperand(0, AnyExitTaken);
+ return LaneMaskPhi;
+ }
+
Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
OriginalTerminator->eraseFromParent();
return LaneMaskPhi;
@@ -2442,6 +2452,21 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
// Skip single-iteration loop region
if (match(LatchExitingBr, m_BranchOnCond(m_True())))
return;
+
+ // Replace the VectorTripCount used in the latch exit condition of loops
+ // with early exits.
+ if (VPValue *VPMainExitCond = nullptr;
+ match(LatchExitingBr, m_BranchOnCond(m_BinaryOr(
+ m_VPValue(), m_VPValue(VPMainExitCond)))) &&
+ match(VPMainExitCond, m_VPInstruction<Instruction::ICmp>(
+ m_Specific(EVLIncrement),
+ m_Specific(&Plan.getVectorTripCount())))) {
+ // Expected pattern here is:
+ // EMIT vp<%main.exit.cond> = icmp eq vp<%evl.next>, vp<%vtc>
+ // EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+ // EMIT branch-on-cond vp<%exit.cond>
+ VPMainExitCond->getDefiningRecipe()->setOperand(1, Plan.getTripCount());
+ return;
+ }
assert(LatchExitingBr &&
match(LatchExitingBr,
m_BranchOnCount(m_VPValue(EVLIncrement),
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll
new file mode 100644
index 0000000000000..dc2a904f06ce0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S -enable-early-exit-vectorization %s | FileCheck %s
+
+declare void @init(ptr)
+
+define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
+; CHECK-LABEL: define i64 @multi_exiting_to_different_exits_live_in_exit_values(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SRC:%.*]] = alloca [128 x i32], align 4
+; CHECK-NEXT: call void @init(ptr [[SRC]])
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 128, [[TMP2]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 128, [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 10)
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP12]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 128
+; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP17]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[E2:.*]]
+; CHECK: [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT: br label %[[E1:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC1]], align 4
+; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[L]], 10
+; CHECK-NEXT: br i1 [[C_1]], label %[[E1]], label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[INC]] = add nuw i64 [[IV1]], 1
+; CHECK-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128
+; CHECK-NEXT: br i1 [[C_2]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[E1]]:
+; CHECK-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: ret i64 [[P1]]
+; CHECK: [[E2]]:
+; CHECK-NEXT: [[P2:%.*]] = phi i64 [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[P2]]
+;
+entry:
+ %src = alloca [128 x i32]
+ call void @init(ptr %src)
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+ %l = load i32, ptr %gep.src
+ %c.1 = icmp eq i32 %l, 10
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header
+
+e1:
+ %p1 = phi i64 [ 0, %loop.header ]
+ ret i64 %p1
+
+e2:
+ %p2 = phi i64 [ 1, %loop.latch ]
+ ret i64 %p2
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
+;.
From 51d5a9e5744fdbbe871ba80897bb2362ff48c869 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Wed, 12 Mar 2025 00:14:28 -0700
Subject: [PATCH 3/8] Add WidenFFLoad
---
.../llvm/Analysis/TargetTransformInfo.h | 3 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 2 +
.../Vectorize/LoopVectorizationLegality.h | 7 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +
.../Target/RISCV/RISCVTargetTransformInfo.h | 3 +
.../Vectorize/LoopVectorizationLegality.cpp | 63 ++++++-
.../Transforms/Vectorize/LoopVectorize.cpp | 35 ++++
llvm/lib/Transforms/Vectorize/VPlan.h | 70 ++++++++
.../Transforms/Vectorize/VPlanAnalysis.cpp | 3 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 58 ++++++
.../Transforms/Vectorize/VPlanTransforms.cpp | 155 +++++++++-------
llvm/lib/Transforms/Vectorize/VPlanValue.h | 4 +
.../Transforms/Vectorize/VPlanVerifier.cpp | 5 +-
.../Transforms/LoopVectorize/RISCV/find.ll | 169 ++++++++++++++++++
.../RISCV/tail-folding-interleave.ll | 4 +-
.../LoopVectorize/RISCV/uniform-load-store.ll | 4 +-
16 files changed, 510 insertions(+), 79 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/find.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 7928835f7f84d..5d562f0e0c180 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1857,6 +1857,9 @@ class TargetTransformInfo {
/// \returns True if the target supports scalable vectors.
LLVM_ABI bool supportsScalableVectors() const;
+ /// \returns True if the target supports speculative loads.
+ LLVM_ABI bool supportsSpeculativeLoads() const;
+
/// \return true when scalable vectorization is preferred.
LLVM_ABI bool enableScalableVectorization() const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 2ea87b3c62895..5655446ccf3ec 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1106,6 +1106,8 @@ class TargetTransformInfoImplBase {
virtual bool supportsScalableVectors() const { return false; }
+ virtual bool supportsSpeculativeLoads() const { return false; }
+
virtual bool enableScalableVectorization() const { return false; }
virtual bool hasActiveVectorLength() const { return false; }
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index cba37363d0474..8026242ac7fe6 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -453,6 +453,10 @@ class LoopVectorizationLegality {
/// Returns a list of all known histogram operations in the loop.
bool hasHistograms() const { return !Histograms.empty(); }
+ const SmallPtrSetImpl<const Instruction *> &getSpeculativeLoads() const {
+ return SpeculativeLoads;
+ }
+
PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
return &PSE;
}
@@ -645,6 +649,9 @@ class LoopVectorizationLegality {
/// may work on the same memory location.
SmallVector<HistogramInfo, 1> Histograms;
+ /// Holds all loads that must be performed speculatively.
+ SmallPtrSet<const Instruction *, 4> SpeculativeLoads;
+
/// BFI and PSI are used to check for profile guided size optimizations.
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 55ba52a1079ce..6482ba003bf38 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1457,6 +1457,10 @@ bool TargetTransformInfo::supportsScalableVectors() const {
return TTIImpl->supportsScalableVectors();
}
+bool TargetTransformInfo::supportsSpeculativeLoads() const {
+ return TTIImpl->supportsSpeculativeLoads();
+}
+
bool TargetTransformInfo::enableScalableVectorization() const {
return TTIImpl->enableScalableVectorization();
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index d62d99cf31899..a6860fdcbde42 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -110,6 +110,9 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
bool supportsScalableVectors() const override {
return ST->hasVInstructions();
}
+ bool supportsSpeculativeLoads() const override {
+ return ST->hasVInstructions();
+ }
bool enableOrderedReductions() const override { return true; }
bool enableScalableVectorization() const override {
return ST->hasVInstructions();
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index cfcf68d445992..6beb0587a5925 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
@@ -1769,15 +1770,61 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
assert(LatchBB->getUniquePredecessor() == SingleUncountableEdge->first &&
"Expected latch predecessor to be the early exiting block");
- // TODO: Handle loops that may fault.
Predicates.clear();
- if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
- &Predicates)) {
- reportVectorizationFailure(
- "Loop may fault",
- "Cannot vectorize potentially faulting early exit loop",
- "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
- return false;
+ SmallVector<LoadInst *, 4> NonDerefLoads;
+ unsigned NumLoad = 0;
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ for (Instruction &I : *BB) {
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ NumLoad++;
+ if (!isDereferenceableAndAlignedInLoop(LI, TheLoop, *PSE.getSE(), *DT,
+ AC, &Predicates))
+ NonDerefLoads.push_back(LI);
+ } else if (I.mayReadFromMemory() || I.mayWriteToMemory() ||
+ I.mayThrow()) {
+ reportVectorizationFailure(
+ "Loop may fault on instructions other than load",
+ "Cannot vectorize potentially faulting early exit loop",
+ "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+ }
+ }
+ // Check whether the non-dereferenceable loads are supported.
+ if (!NonDerefLoads.empty()) {
+ if (!TTI->supportsSpeculativeLoads()) {
+ reportVectorizationFailure(
+ "Loop may fault",
+ "Cannot vectorize potentially faulting early exit loop",
+ "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+ // Only a single speculative load is supported for now.
+ if (NumLoad > 1) {
+ reportVectorizationFailure("More than one speculative load",
+ "SpeculativeLoadNotSupported", ORE, TheLoop);
+ return false;
+ }
+ }
+ // Speculative loads must be unit-stride and placed in the loop header.
+ for (LoadInst *LI : NonDerefLoads) {
+ if (LI->getParent() != TheLoop->getHeader()) {
+ reportVectorizationFailure("Cannot vectorize predicated speculative load",
+ "SpeculativeLoadNeedsPredication", ORE,
+ TheLoop);
+ return false;
+ }
+ int Stride = isConsecutivePtr(LI->getType(), LI->getPointerOperand());
+ if (Stride != 1) {
+ reportVectorizationFailure("Loop contains non-unit-stride load",
+ "Cannot vectorize early exit loop with "
+ "speculative non-unit-stride load",
+ "SpeculativeNonUnitStrideLoadEarlyExitLoop",
+ ORE, TheLoop);
+ return false;
+ }
+ SpeculativeLoads.insert(LI);
+ LLVM_DEBUG(dbgs() << "LV: Found speculative load: " << *LI << "\n");
}
[[maybe_unused]] const SCEV *SymbolicMaxBTC =
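
A sketch of the kind of loop these checks are intended to admit
(illustrative IR): a single unit-stride load in the header, strlen-style,
that cannot be proven dereferenceable for the full iteration space.

    loop:
      %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
      %gep = getelementptr inbounds i8, ptr %s, i64 %iv
      %c = load i8, ptr %gep, align 1       ; may fault past the early exit
      %is.nul = icmp eq i8 %c, 0
      br i1 %is.nul, label %found, label %latch
    latch:
      %iv.next = add nuw i64 %iv, 1
      %cmp = icmp ult i64 %iv.next, %n
      br i1 %cmp, label %loop, label %not.found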
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a20133ee8f3ae..0d6d69a069e1b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -402,6 +402,10 @@ static cl::opt<bool> EnableEarlyExitVectorization(
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));
+static cl::opt<bool> EnableSpeculativeLoads(
+ "enable-speculative-load", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization of loops with speculative loads."));
+
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
@@ -1376,6 +1380,9 @@ class LoopVectorizationCostModel {
if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
return;
+ // Do not override EVL styles for speculative loads.
+ if (!Legal->getSpeculativeLoads().empty())
+ return;
// Override EVL styles if needed.
// FIXME: Investigate opportunity for fixed vector factor.
bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
@@ -4229,6 +4236,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPWidenPointerInductionSC:
case VPDef::VPReductionPHISC:
case VPDef::VPInterleaveSC:
+ case VPDef::VPWidenFFLoadEVLSC:
+ case VPDef::VPWidenFFLoadSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenStoreEVLSC:
@@ -7732,6 +7741,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
+
if (Consecutive) {
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
@@ -7756,6 +7766,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}
+ if (Legal->getSpeculativeLoads().contains(I)) {
+ LoadInst *Load = cast<LoadInst>(I);
+ return new VPWidenFFLoadRecipe(*Load, Ptr, Mask, VPIRMetadata(*Load, LVer),
+ I->getDebugLoc());
+ }
+
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
VPIRMetadata(*Load, LVer), I->getDebugLoc());
@@ -9982,6 +9998,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
+ if (!LVL.getSpeculativeLoads().empty()) {
+ if (!EnableSpeculativeLoads) {
+ reportVectorizationFailure("Auto-vectorization of loops with speculative "
+ "load is not enabled",
+ "SpeculativeLoadsDisabled", ORE, L);
+ return false;
+ }
+ // DataWithEVL is needed to lower VPWidenFFLoadRecipe into
+ // VPWidenFFLoadEVLRecipe.
+ if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL ||
+ PreferPredicateOverEpilogue !=
+ PreferPredicateTy::PredicateOrDontVectorize) {
+ reportVectorizationFailure("Auto-vectorization of loops with speculative "
+ "load requires EVL tail folding",
+ "SpeculativeLoadsRequireEVL", ORE, L);
+ return false;
+ }
+ }
+
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
// even evaluating whether vectorization is profitable. Since we cannot modify
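
Taken together, the gating above means such loops currently vectorize only
under an invocation along the lines of the test RUN lines, roughly:

    opt -passes=loop-vectorize -enable-early-exit-vectorization \
        -enable-speculative-load \
        -force-tail-folding-style=data-with-evl \
        -prefer-predicate-over-epilogue=predicate-dont-vectorize \
        -mtriple=riscv64 -mattr=+v -S input.ll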
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a5de5933d5ff1..32745a72c67fb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -559,6 +559,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
+ case VPRecipeBase::VPWidenFFLoadEVLSC:
+ case VPRecipeBase::VPWidenFFLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
@@ -2980,6 +2982,8 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenFFLoadSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenFFLoadEVLSC ||
R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
}
@@ -3061,6 +3065,72 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
}
};
+struct VPWidenFFLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
+ VPWidenFFLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
+ const VPIRMetadata &Metadata, DebugLoc DL)
+ : VPWidenMemoryRecipe(VPDef::VPWidenFFLoadSC, Load, {Addr},
+ /*Consecutive*/ true, /*Reverse*/ false, Metadata,
+ DL),
+ VPValue(this, &Load) {
+ setMask(Mask);
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenFFLoadSC);
+
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("cannot execute this recipe, should be replaced by "
+ "VPWidenFFLoadEVLRecipe");
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getAddr();
+ }
+};
+
+struct VPWidenFFLoadEVLRecipe final : public VPWidenMemoryRecipe,
+ public VPValue {
+ VPWidenFFLoadEVLRecipe(VPWidenFFLoadRecipe &L, VPValue &EVL, VPValue *Mask)
+ : VPWidenMemoryRecipe(VPDef::VPWidenFFLoadEVLSC, L.getIngredient(),
+ {L.getAddr(), &EVL}, true, false, L,
+ L.getDebugLoc()),
+ VPValue(this, &getIngredient()) {
+ new VPValue(nullptr, this); // Defines the second result: the new VL actually read.
+ setMask(Mask);
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenFFLoadEVLSC);
+
+ /// Return the EVL operand.
+ VPValue *getEVL() const { return getOperand(1); }
+
+ /// Generate a fault-only-first load using a vector-predication intrinsic.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ // Widened, consecutive load operations only demand the first lane of
+ // their address.
+ return Op == getEVL() || Op == getAddr();
+ }
+};
+
/// A recipe for widening load operations with vector-predication intrinsics,
/// using the address to load from, the explicit vector length and an optional
/// mask.
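
For reference, the two new recipes print roughly as follows in VPlan dumps
(value names illustrative; the mask operand is optional), per the print()
implementations later in this patch:

    WIDEN ir<%l> = fault-only-first-load vp<%addr>, vp<%mask>
    WIDEN ir<%l>, vp<%vl> = vp.load.ff vp<%addr>, vp<%evl>, vp<%mask>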
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 16072f268a98c..b23bc9c35698f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -186,7 +186,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
- assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
+ assert((isa<VPWidenLoadRecipe, VPWidenFFLoadRecipe, VPWidenFFLoadEVLRecipe,
+ VPWidenLoadEVLRecipe>(R)) &&
"Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 68e7c20a070f4..086b8b23a67b0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -84,6 +84,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenIntOrFpInductionSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
+ case VPWidenFFLoadEVLSC:
+ case VPWidenFFLoadSC:
case VPWidenPHISC:
case VPWidenSC:
case VPWidenSelectSC: {
@@ -107,6 +109,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
+ case VPWidenFFLoadEVLSC:
+ case VPWidenFFLoadSC:
return true;
case VPReplicateSC:
return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
@@ -184,6 +188,9 @@ bool VPRecipeBase::mayHaveSideEffects() const {
"underlying instruction has side-effects");
return false;
}
+ case VPWidenFFLoadEVLSC:
+ case VPWidenFFLoadSC:
+ return true;
case VPInterleaveSC:
return mayWriteToMemory();
case VPWidenLoadEVLSC:
@@ -960,6 +967,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::FirstActiveLane: {
+ if (VF.isScalar())
+ return InstructionCost::getInvalid();
// Calculate the cost of determining the lane index.
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
@@ -3155,6 +3164,55 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPWidenFFLoadEVLRecipe::execute(VPTransformState &State) {
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
+
+ Value *EVL = State.get(getEVL(), VPLane(0));
+ Value *Addr = State.get(getAddr(), true);
+ Value *Mask = nullptr;
+ if (VPValue *VPMask = getMask())
+ Mask = State.get(VPMask);
+ else
+ Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+ CallInst *NewLI =
+ Builder.CreateIntrinsic(Intrinsic::vp_load_ff, {DataTy, Addr->getType()},
+ {Addr, Mask, EVL}, nullptr, "vp.op.load.ff");
+ NewLI->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ applyMetadata(*NewLI);
+ Value *V = cast<Instruction>(Builder.CreateExtractValue(NewLI, 0));
+ Value *VL = Builder.CreateExtractValue(NewLI, 1);
+ State.set(getVPValue(0), V);
+ State.set(getVPValue(1), VL, /*NeedsScalar=*/true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenFFLoadRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN ";
+ printAsOperand(O, SlotTracker);
+ O << " = fault-only-first-load ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenFFLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN ";
+ printAsOperand(O, SlotTracker);
+ O << ", ";
+ getVPValue(1)->printAsOperand(O, SlotTracker);
+ O << " = vp.load.ff ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
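For reference, the call that execute() above emits unpacks into IR of roughly
the following shape; the value names are illustrative, and the concrete form
appears in the find.ll test added further down:

  %vp.op.load.ff = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 %addr, <vscale x 4 x i1> %mask, i32 %evl)
  %vec = extractvalue { <vscale x 4 x i32>, i32 } %vp.op.load.ff, 0    ; loaded lanes
  %newvl = extractvalue { <vscale x 4 x i32>, i32 } %vp.op.load.ff, 1  ; lanes actually read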
/// Use all-true mask for reverse rather than actual mask, as it avoids a
/// dependence w/o affecting the result.
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e8f07610604cc..557bd838d2904 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2149,8 +2149,7 @@ void VPlanTransforms::addActiveLaneMask(
/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
/// \p EVL The explicit vector length parameter of vector-predication
/// intrinsics.
-static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
- VPRecipeBase &CurRecipe,
+static VPRecipeBase *optimizeMaskToEVL(VPlan &Plan, VPRecipeBase &CurRecipe,
VPTypeAnalysis &TypeInfo,
VPValue &AllOneMask, VPValue &EVL) {
using namespace llvm::VPlanPatternMatch;
@@ -2158,9 +2157,13 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
assert(OrigMask && "Unmasked recipe when folding tail");
// HeaderMask will be handled using EVL.
VPValue *Mask;
- if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
+ VPValue *HeaderMask;
+ if (match(OrigMask, m_LogicalAnd(m_VPValue(HeaderMask), m_VPValue(Mask))) &&
+ vputils::isHeaderMask(HeaderMask, Plan))
return Mask;
- return HeaderMask == OrigMask ? nullptr : OrigMask;
+ if (vputils::isHeaderMask(OrigMask, Plan))
+ return nullptr;
+ return OrigMask;
};
return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
@@ -2168,6 +2171,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPValue *NewMask = GetNewMask(L->getMask());
return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
})
+ .Case<VPWidenFFLoadRecipe>([&](VPWidenFFLoadRecipe *L) {
+ VPValue *NewMask = GetNewMask(L->getMask());
+ return new VPWidenFFLoadEVLRecipe(*L, EVL, NewMask);
+ })
.Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
VPValue *NewMask = GetNewMask(S->getMask());
return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
@@ -2182,8 +2189,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
// select(header_mask, LHS, RHS)
// into vector predication merge.
// vp.merge(all-true, LHS, RHS, EVL)
- if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
- m_VPValue(RHS))))
+ VPValue *HeaderMask;
+ if (!match(VPI, m_Select(m_VPValue(HeaderMask), m_VPValue(LHS),
+ m_VPValue(RHS))) ||
+ !vputils::isHeaderMask(HeaderMask, Plan))
return nullptr;
// Use all true as the condition because this transformation is
// limited to selects whose condition is a header mask.
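As an illustrative sketch (types chosen arbitrarily), the rewrite described in
the comment above turns

  %r = select <vscale x 4 x i1> %header.mask, <vscale x 4 x i32> %lhs, <vscale x 4 x i32> %rhs

into

  %r = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %lhs, <vscale x 4 x i32> %rhs, i32 %evl)

where lanes at or beyond %evl take the %rhs value, so the header mask is
subsumed by the explicit vector length.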
@@ -2195,7 +2204,7 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
}
/// Replace recipes with their EVL variants.
-static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
+static VPValue *transformRecipestoEVLRecipes(VPlan &Plan) {
Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
VPTypeAnalysis TypeInfo(CanonicalIVType);
LLVMContext &Ctx = CanonicalIVType->getContext();
@@ -2207,7 +2216,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
VPWidenIntOrFpInductionRecipe>) &&
"User of VF that we can't transform to EVL.");
- Plan.getVF().replaceAllUsesWith(&EVL);
// Defer erasing recipes till the end so that we don't invalidate the
// VPTypeAnalysis cache.
@@ -2215,6 +2223,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// Create a scalar phi to track the previous EVL if fixed-order recurrence is
// contained.
+ VPInstruction *PrevEVL = nullptr;
bool ContainsFORs =
any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
if (ContainsFORs) {
@@ -2227,79 +2236,97 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
DebugLoc());
Builder.setInsertPoint(Header, Header->getFirstNonPhi());
- VPValue *PrevEVL =
- Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
-
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
- vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
- for (VPRecipeBase &R : *VPBB) {
- using namespace VPlanPatternMatch;
- VPValue *V1, *V2;
- if (!match(&R,
- m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
- m_VPValue(V1), m_VPValue(V2))))
- continue;
+ PrevEVL = Builder.createScalarPhi({MaxEVL}, DebugLoc(), "prev.evl");
+ }
+
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ VPValue *LastEVL = nullptr;
+ VPValue *VF = &Plan.getVF();
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPRecipeBase &CurRecipe : *VPBB) {
+ auto *VPI = dyn_cast<VPInstruction>(&CurRecipe);
+ if (VPI && (VPI->getOpcode() == VPInstruction::ExplicitVectorLength)) {
+ assert((LastEVL == nullptr) && "EVL should be set only once");
+ LastEVL = VPI;
+ continue;
+ }
+ if (!LastEVL)
+ continue;
+ if (isa<VPVectorEndPointerRecipe>(&CurRecipe)) {
+ if (CurRecipe.getOperand(1) == VF)
+ CurRecipe.setOperand(1, LastEVL);
+ continue;
+ }
+ if (isa<VPScalarIVStepsRecipe>(&CurRecipe)) {
+ if (CurRecipe.getOperand(2) == VF)
+ CurRecipe.setOperand(2, LastEVL);
+ continue;
+ }
+ VPValue *V1, *V2;
+ using namespace VPlanPatternMatch;
+ if (match(&CurRecipe,
+ m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
+ m_VPValue(V1), m_VPValue(V2)))) {
VPValue *Imm = Plan.getOrAddLiveIn(
ConstantInt::getSigned(Type::getInt32Ty(Ctx), -1));
VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_splice,
- {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
- TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
- VPSplice->insertBefore(&R);
- R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
- ToErase.push_back(&R);
+ {V1, V2, Imm, AllOneMask, PrevEVL, LastEVL},
+ TypeInfo.inferScalarType(CurRecipe.getVPSingleValue()),
+ CurRecipe.getDebugLoc());
+ VPSplice->insertBefore(&CurRecipe);
+ CurRecipe.getVPSingleValue()->replaceAllUsesWith(VPSplice);
+ ToErase.push_back(&CurRecipe);
+ continue;
}
- }
- }
-
- // Try to optimize header mask recipes away to their EVL variants.
- for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
- // TODO: Split optimizeMaskToEVL out and move into
- // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
- // tryToBuildVPlanWithVPRecipes beforehand.
- for (VPUser *U : collectUsersRecursively(HeaderMask)) {
- auto *CurRecipe = cast<VPRecipeBase>(U);
+ // TODO: Split optimizeMaskToEVL out and move into
+ // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
+ // tryToBuildVPlanWithVPRecipes beforehand.
VPRecipeBase *EVLRecipe =
- optimizeMaskToEVL(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
+ optimizeMaskToEVL(Plan, CurRecipe, TypeInfo, *AllOneMask, *LastEVL);
if (!EVLRecipe)
continue;
[[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
- assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
- "New recipe must define the same number of values as the "
- "original.");
- assert(
- NumDefVal <= 1 &&
- "Only supports recipes with a single definition or without users.");
- EVLRecipe->insertBefore(CurRecipe);
+ // Check if the recipe updates the EVL: an FF load defines the number of
+ // lanes actually read, which subsequent recipes must use as their EVL.
+ if (isa<VPWidenFFLoadEVLRecipe>(EVLRecipe)) {
+ VPValue *CurVPV = CurRecipe.getVPSingleValue();
+ CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(0));
+ LastEVL = EVLRecipe->getVPValue(1);
+ } else {
+ assert(NumDefVal == CurRecipe.getNumDefinedValues() &&
+ "New recipe must define the same number of values as the "
+ "original.");
+ assert(
+ NumDefVal <= 1 &&
+ "Only supports recipes with a single definition or without users.");
+ }
+ EVLRecipe->insertBefore(&CurRecipe);
if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
- VPValue *CurVPV = CurRecipe->getVPSingleValue();
+ VPValue *CurVPV = CurRecipe.getVPSingleValue();
CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
}
- ToErase.push_back(CurRecipe);
+ ToErase.push_back(&CurRecipe);
}
-
- // Replace header masks with a mask equivalent to predicating by EVL:
- //
- // icmp ule widen-canonical-iv backedge-taken-count
- // ->
- // icmp ult step-vector, EVL
- VPRecipeBase *EVLR = EVL.getDefiningRecipe();
- VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
- Type *EVLType = TypeInfo.inferScalarType(&EVL);
- VPValue *EVLMask = Builder.createICmp(
- CmpInst::ICMP_ULT,
- Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
- HeaderMask->replaceAllUsesWith(EVLMask);
- ToErase.push_back(HeaderMask->getDefiningRecipe());
}
-
+ for (VPRecipeBase &CurRecipe : Header->phis()) {
+ if (isa<VPWidenIntOrFpInductionRecipe>(&CurRecipe)) {
+ if (CurRecipe.getOperand(2) == VF)
+ CurRecipe.setOperand(2, LastEVL);
+ continue;
+ }
+ }
+ if (PrevEVL)
+ PrevEVL->addOperand(LastEVL);
for (VPRecipeBase *R : reverse(ToErase)) {
SmallVector<VPValue *> PossiblyDead(R->operands());
R->eraseFromParent();
for (VPValue *Op : PossiblyDead)
recursivelyDeleteDeadRecipes(Op);
}
+ return LastEVL;
}
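The observable effect of threading LastEVL is that, after a fault-only-first
load, the EVL-based IV advances by the lane count the load actually produced
rather than by the requested EVL. A sketch, with names mirroring the find.ll
test added below:

  %newvl = extractvalue { <vscale x 4 x i32>, i32 } %vp.op.load.ff, 1
  %newvl.zext = zext i32 %newvl to i64
  %index.evl.next = add i64 %newvl.zext, %evl.based.iv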
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
@@ -2370,13 +2397,13 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
}
- auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
- DebugLoc());
+ Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL, DebugLoc());
+
+ VPValue *OpVPEVL = transformRecipestoEVLRecipes(Plan);
auto *CanonicalIVIncrement =
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
Builder.setInsertPoint(CanonicalIVIncrement);
- VPValue *OpVPEVL = VPEVL;
auto *I32Ty = Type::getInt32Ty(CanIVTy->getContext());
OpVPEVL = Builder.createScalarZExtOrTrunc(
@@ -2389,8 +2416,6 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
EVLPhi->addOperand(NextEVLIV);
- transformRecipestoEVLRecipes(Plan, *VPEVL);
-
// Replace all uses of VPCanonicalIVPHIRecipe by
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 24f6d61512ef6..179e557731ce9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -40,6 +40,7 @@ class VPUser;
class VPRecipeBase;
class VPInterleaveRecipe;
class VPPhiAccessors;
+class VPWidenFFLoadEVLRecipe;
// This is the base class of the VPlan Def/Use graph, used for modeling the data
// flow into, within and out of the VPlan. VPValues can stand for live-ins
@@ -51,6 +52,7 @@ class LLVM_ABI_FOR_TEST VPValue {
friend class VPInterleaveRecipe;
friend class VPlan;
friend class VPExpressionRecipe;
+ friend class VPWidenFFLoadEVLRecipe;
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -348,6 +350,8 @@ class VPDef {
VPWidenCastSC,
VPWidenGEPSC,
VPWidenIntrinsicSC,
+ VPWidenFFLoadSC,
+ VPWidenFFLoadEVLSC,
VPWidenLoadEVLSC,
VPWidenLoadSC,
VPWidenStoreEVLSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 57d01cbefbe26..252754724caa2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -167,7 +167,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
}
return VerifyEVLUse(*R, 2);
})
- .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>(
+ .Case<VPWidenLoadEVLRecipe, VPWidenFFLoadEVLRecipe,
+ VPVectorEndPointerRecipe>(
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
.Case<VPInstructionWithType>(
[&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
@@ -175,6 +176,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
if (I->getOpcode() == Instruction::PHI ||
I->getOpcode() == Instruction::ICmp)
return VerifyEVLUse(*I, 1);
+ if (I->getOpcode() == Instruction::Sub)
+ return VerifyEVLUse(*I, 0);
switch (I->getOpcode()) {
case Instruction::Add:
break;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/find.ll b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
new file mode 100644
index 0000000000000..58f921239e00f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -enable-speculative-load -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
+
+define ptr @find_with_liveout(ptr %first, ptr %last, ptr %value) {
+; CHECK-LABEL: define ptr @find_with_liveout(
+; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]], ptr [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq ptr [[FIRST]], [[LAST]]
+; CHECK-NEXT: br i1 [[CMP_NOT6]], label %[[RETURN:.*]], label %[[FOR_BODY_LR_PH:.*]]
+; CHECK: [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VALUE]], align 4
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[FIRST_ADDR_07:%.*]] = phi ptr [ [[FIRST]], %[[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[FIRST_ADDR_07]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: br i1 [[CMP1]], label %[[RETURN_LOOPEXIT:.*]], label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[FIRST_ADDR_07]], i64 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[LAST]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[RETURN_LOOPEXIT]], label %[[FOR_BODY]]
+; CHECK: [[RETURN_LOOPEXIT]]:
+; CHECK-NEXT: [[RETVAL_0_PH:%.*]] = phi ptr [ [[FIRST_ADDR_07]], %[[FOR_BODY]] ], [ [[LAST]], %[[FOR_INC]] ]
+; CHECK-NEXT: br label %[[RETURN]]
+; CHECK: [[RETURN]]:
+; CHECK-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[FIRST]], %[[ENTRY]] ], [ [[RETVAL_0_PH]], %[[RETURN_LOOPEXIT]] ]
+; CHECK-NEXT: ret ptr [[RETVAL_0]]
+;
+entry:
+ %cmp.not6 = icmp eq ptr %first, %last
+ br i1 %cmp.not6, label %return, label %for.body.lr.ph
+
+for.body.lr.ph:
+ %0 = load i32, ptr %value, align 4
+ br label %for.body
+
+for.body:
+ %first.addr.07 = phi ptr [ %first, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ]
+ %1 = load i32, ptr %first.addr.07, align 4
+ %cmp1 = icmp eq i32 %1, %0
+ br i1 %cmp1, label %return.loopexit, label %for.inc
+
+for.inc:
+ %incdec.ptr = getelementptr inbounds i32, ptr %first.addr.07, i64 1
+ %cmp.not = icmp eq ptr %incdec.ptr, %last
+ br i1 %cmp.not, label %return.loopexit, label %for.body
+
+return.loopexit:
+ %retval.0.ph = phi ptr [ %first.addr.07, %for.body ], [ %last, %for.inc ]
+ br label %return
+
+return:
+ %retval.0 = phi ptr [ %first, %entry ], [ %retval.0.ph, %return.loopexit ]
+ ret ptr %retval.0
+}
+
+define i32 @find_without_liveout(ptr %first, ptr %last, ptr %value) {
+; CHECK-LABEL: define i32 @find_without_liveout(
+; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]], ptr [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[FIRST4:%.*]] = ptrtoint ptr [[FIRST]] to i64
+; CHECK-NEXT: [[LAST3:%.*]] = ptrtoint ptr [[LAST]] to i64
+; CHECK-NEXT: [[FIRST2:%.*]] = ptrtoint ptr [[FIRST]] to i64
+; CHECK-NEXT: [[LAST1:%.*]] = ptrtoint ptr [[LAST]] to i64
+; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq ptr [[FIRST]], [[LAST]]
+; CHECK-NEXT: br i1 [[CMP_NOT6]], label %[[RETURN:.*]], label %[[FOR_BODY_LR_PH:.*]]
+; CHECK: [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VALUE]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[LAST3]], -4
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[FIRST4]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[LAST1]] to i2
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[FIRST2]] to i2
+; CHECK-NEXT: [[TMP7:%.*]] = sub i2 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = zext i2 [[TMP7]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP4]], [[TMP11]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[TMP4]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 4
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[VP_OP_LOAD_FF:%.*]] = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq <vscale x 4 x i32> [[TMP16]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP18]])
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP4]]
+; CHECK-NEXT: [[TMP22:%.*]] = or i1 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP20]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[RETURN_LOOPEXIT:.*]]
+; CHECK: [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT: br label %[[RETURN_LOOPEXIT]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[FIRST]], %[[FOR_BODY_LR_PH]] ], [ [[FIRST]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[FIRST_ADDR_07:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[FIRST_ADDR_07]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP23]], [[TMP0]]
+; CHECK-NEXT: br i1 [[CMP1]], label %[[RETURN_LOOPEXIT]], label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[FIRST_ADDR_07]], i64 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[LAST]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[RETURN_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[RETURN_LOOPEXIT]]:
+; CHECK-NEXT: [[RETVAL_0_PH:%.*]] = phi i32 [ 0, %[[FOR_BODY]] ], [ 1, %[[FOR_INC]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: br label %[[RETURN]]
+; CHECK: [[RETURN]]:
+; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RETVAL_0_PH]], %[[RETURN_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+entry:
+ %cmp.not6 = icmp eq ptr %first, %last
+ br i1 %cmp.not6, label %return, label %for.body.lr.ph
+
+for.body.lr.ph:
+ %0 = load i32, ptr %value, align 4
+ br label %for.body
+
+for.body:
+ %first.addr.07 = phi ptr [ %first, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ]
+ %1 = load i32, ptr %first.addr.07, align 4
+ %cmp1 = icmp eq i32 %1, %0
+ br i1 %cmp1, label %return.loopexit, label %for.inc
+
+for.inc:
+ %incdec.ptr = getelementptr inbounds i32, ptr %first.addr.07, i64 1
+ %cmp.not = icmp eq ptr %incdec.ptr, %last
+ br i1 %cmp.not, label %return.loopexit, label %for.body
+
+return.loopexit:
+ %retval.0.ph = phi i32 [ 0, %for.body ], [ 1, %for.inc ]
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 0, %entry ], [ %retval.0.ph, %return.loopexit ]
+ ret i32 %retval.0
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index 2678989731634..236cc92f7a0c0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -32,8 +32,8 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP13]]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 41b96365af59d..20f32aae426b0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -867,8 +867,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]]
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; TF-SCALABLE-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64
-; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP13]]
+; TF-SCALABLE-NEXT: [[TMP12:%.*]] = zext i32 [[TMP9]] to i64
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP12]]
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
>From 16ea10e689aea27c13f23b793298158ce8e7df29 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Wed, 30 Jul 2025 02:49:10 -0700
Subject: [PATCH 4/8] Update tests after rebase
---
.../Transforms/LoopVectorize/RISCV/find.ll | 3 +-
.../Transforms/LoopVectorize/RISCV/pr88802.ll | 5 +---
.../RISCV/tail-folding-cond-reduction.ll | 30 ++++++++++++++-----
.../LoopVectorize/RISCV/uniform-load-store.ll | 9 ++----
...ectorize-force-tail-with-evl-early-exit.ll | 3 +-
5 files changed, 28 insertions(+), 22 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/find.ll b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
index 58f921239e00f..3e77436fbede9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
@@ -97,8 +97,7 @@ define i32 @find_without_liveout(ptr %first, ptr %last, ptr %value) {
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 4
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[VP_OP_LOAD_FF:%.*]] = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; CHECK-NEXT: [[VP_OP_LOAD_FF:%.*]] = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 [[NEXT_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 0
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 1
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq <vscale x 4 x i32> [[TMP16]], [[BROADCAST_SPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index b82b7f3fb33b4..81ff3ba45e1fa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -34,13 +34,10 @@ define void @test(ptr %p, i64 %a, i8 %b) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ]
; CHECK-NEXT: [[AVL:%.*]] = sub i32 9, [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP11]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = mul i32 1, [[TMP11]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP20]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <vscale x 2 x i32> [[TMP19]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ule <vscale x 2 x i32> [[VEC_IND]], splat (i32 8)
; CHECK-NEXT: [[TMP14:%.*]] = icmp sge <vscale x 2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i32> [[TMP7]], <vscale x 2 x i32> [[TMP8]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
index 48e080c93f0b5..4d93868bbd22c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
@@ -250,19 +250,24 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL-OUTLOOP: vector.body:
; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP9]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true)
-; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
-; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 4 x i32> [[TMP12]], [[BROADCAST_SPLAT]]
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
+; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT2]], [[TMP13]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sle <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
@@ -757,26 +762,34 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = mul <vscale x 4 x i64> [[TMP15]], splat (i64 1)
+; IF-EVL-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP16]]
; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP10]], splat (i32 1)
; IF-EVL-OUTLOOP-NEXT: [[INDUCTION1:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP13]]
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL-OUTLOOP: vector.body:
; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[VEC_IND2:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[IV]]
; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP14]], i64 0
-; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = mul i32 1, [[TMP14]]
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp ult <vscale x 4 x i32> [[TMP12]], [[BROADCAST_SPLAT4]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = zext i32 [[TMP14]] to i64
+; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = mul i64 1, [[TMP12]]
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP17]], i64 0
+; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp sle <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_IND2]]
@@ -786,6 +799,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[TMP24]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP14]])
; IF-EVL-OUTLOOP-NEXT: [[TMP25:%.*]] = zext i32 [[TMP14]] to i64
; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[IV]]
+; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT7]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT2]]
; IF-EVL-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 20f32aae426b0..3350f40105608 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -412,14 +412,11 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]]
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP11]]
; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; TF-SCALABLE-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ult <vscale x 4 x i32> [[TMP16]], [[BROADCAST_SPLAT4]]
+; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], splat (i64 1024)
; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 4 x i64> [[VEC_IND]], splat (i64 10)
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.vp.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP7]])
@@ -877,8 +874,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025
+; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll
index dc2a904f06ce0..06d0a4f146b11 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll
@@ -26,8 +26,7 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
; CHECK-NEXT: [[AVL:%.*]] = sub i64 128, [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[GEP_SRC]], i32 0
-; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[GEP_SRC]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 10)
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]]
>From b1f0b92c7ff76bdf3e8d9a76b31490801abd2f95 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Fri, 1 Aug 2025 00:16:46 -0700
Subject: [PATCH 5/8] clang-formatted
---
llvm/include/llvm/IR/Intrinsics.td | 12 +++++++-----
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++--
2 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1a23276bc6ffb..14529f173db94 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1932,11 +1932,13 @@ def int_vp_load : DefaultAttrsIntrinsic<[ llvm_anyvector_ty],
llvm_i32_ty],
[ NoCapture<ArgIndex<0>>, IntrReadMem, IntrArgMemOnly ]>;
-def int_vp_load_ff : DefaultAttrsIntrinsic<[ llvm_anyvector_ty, llvm_i32_ty ],
- [ llvm_anyptr_ty,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_i32_ty],
- [ NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>;
+def int_vp_load_ff
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, llvm_i32_ty],
+ [llvm_anyptr_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_i32_ty],
+ [NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem,
+ IntrWillReturn, IntrArgMemOnly]>;
def int_vp_gather: DefaultAttrsIntrinsic<[ llvm_anyvector_ty],
[ LLVMVectorOfAnyPointersToElt<0>,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 557bd838d2904..32bb136446bf4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2282,8 +2282,8 @@ static VPValue *transformRecipestoEVLRecipes(VPlan &Plan) {
continue;
}
// TODO: Split optimizeMaskToEVL out and move into
- // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
- // tryToBuildVPlanWithVPRecipes beforehand.
+ // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run
+ // in tryToBuildVPlanWithVPRecipes beforehand.
VPRecipeBase *EVLRecipe =
optimizeMaskToEVL(Plan, CurRecipe, TypeInfo, *AllOneMask, *LastEVL);
if (!EVLRecipe)
>From 855874d00df8a4dcdd828377f413610013e4efbb Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Fri, 1 Aug 2025 00:35:01 -0700
Subject: [PATCH 6/8] Refine codegen
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 2 +-
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +-
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index af895aa84aa89..6f2ad33094dce 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1668,7 +1668,7 @@ class SelectionDAG {
ArrayRef<SDValue> Ops,
MachineMemOperand *MMO,
ISD::MemIndexType IndexType);
- LLVM_ABI SDValue getLoadFFVP(EVT VT, const SDLoc &dl, SDValue Chain,
+ LLVM_ABI SDValue getLoadFFVP(EVT VT, const SDLoc &DL, SDValue Chain,
SDValue Ptr, SDValue Mask, SDValue EVL,
MachineMemOperand *MMO);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 8acdb1b93faff..0cb9cbb7263c1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -838,7 +838,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
break;
}
case ISD::VP_LOAD_FF: {
- const VPLoadFFSDNode *LD = cast<VPLoadFFSDNode>(N);
+ const auto *LD = cast<VPLoadFFSDNode>(N);
ID.AddInteger(LD->getMemoryVT().getRawBits());
ID.AddInteger(LD->getRawSubclassData());
ID.AddInteger(LD->getPointerInfo().getAddrSpace());
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5c87ceb2ffd72..30a50da5d0c5d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -12690,7 +12690,7 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
}
SDValue RISCVTargetLowering::lowerLoadFF(SDValue Op, SelectionDAG &DAG) const {
- assert(Op.getResNo() == 0);
+ assert(Op.getResNo() == 0 && "Unexpected result number");
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
>From 34b8c6e0d0295e25ac8dcd30204a798baada3e3f Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Fri, 1 Aug 2025 01:50:12 -0700
Subject: [PATCH 7/8] Address comments
---
llvm/include/llvm/Analysis/Loads.h | 8 ++++
llvm/lib/Analysis/Loads.cpp | 16 +++++++
.../Vectorize/LoopVectorizationLegality.cpp | 45 +++++--------------
.../Transforms/Vectorize/LoopVectorize.cpp | 3 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 -
5 files changed, 37 insertions(+), 37 deletions(-)
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 84564563de8e3..080757b6d1fe0 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -91,6 +91,14 @@ LLVM_ABI bool isDereferenceableReadOnlyLoop(
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
+/// Return true if the loop \p L only contains read-only memory accesses.
+/// Loads that are not guaranteed to be dereferenceable on every iteration
+/// (and must therefore be speculated) are collected into \p SpeculativeLoads.
+LLVM_ABI bool isReadOnlyLoopWithSafeOrSpeculativeLoads(
+ Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+ SmallVectorImpl<LoadInst *> *SpeculativeLoads,
+ SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
+
/// Return true if we know that executing a load from this value cannot trap.
///
/// If DT and ScanFrom are specified this method performs context-sensitive
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 393f2648de3c9..5774a5c4105bd 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -862,3 +862,19 @@ bool llvm::isDereferenceableReadOnlyLoop(
}
return true;
}
+
+bool llvm::isReadOnlyLoopWithSafeOrSpeculativeLoads(
+ Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+ SmallVectorImpl<LoadInst *> *SpeculativeLoads,
+ SmallVectorImpl<const SCEVPredicate *> *Predicates) {
+ for (BasicBlock *BB : L->blocks()) {
+ for (Instruction &I : *BB) {
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates))
+ SpeculativeLoads->push_back(LI);
+ } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
+ return false;
+ }
+ }
+ return true;
+}
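As an illustration (not part of the patch), here is a minimal early-exit
search loop this predicate accepts: every instruction besides the load is
read-only and non-throwing, and since the load cannot be proven
dereferenceable for the whole trip count it is collected into
SpeculativeLoads rather than rejecting the loop:

  define i1 @contains(ptr %base, i32 %key, i64 %n) {
  entry:
    br label %loop
  loop:
    %i = phi i64 [ 0, %entry ], [ %i.next, %latch ]
    %gep = getelementptr inbounds i32, ptr %base, i64 %i
    %v = load i32, ptr %gep, align 4     ; collected into SpeculativeLoads
    %found = icmp eq i32 %v, %key
    br i1 %found, label %exit, label %latch
  latch:
    %i.next = add nuw i64 %i, 1
    %done = icmp eq i64 %i.next, %n
    br i1 %done, label %exit, label %loop
  exit:
    %r = phi i1 [ true, %loop ], [ false, %latch ]
    ret i1 %r
  }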
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 6beb0587a5925..4382aff170043 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1772,39 +1772,18 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
Predicates.clear();
SmallVector<LoadInst *, 4> NonDerefLoads;
- unsigned NumLoad = 0;
- for (BasicBlock *BB : TheLoop->blocks()) {
- for (Instruction &I : *BB) {
- if (auto *LI = dyn_cast<LoadInst>(&I)) {
- NumLoad++;
- if (!isDereferenceableAndAlignedInLoop(LI, TheLoop, *PSE.getSE(), *DT,
- AC, &Predicates))
- NonDerefLoads.push_back(LI);
- } else if (I.mayReadFromMemory() || I.mayWriteToMemory() ||
- I.mayThrow()) {
- reportVectorizationFailure(
- "Loop may fault on instructions other than load",
- "Cannot vectorize potentially faulting early exit loop",
- "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
- return false;
- }
- }
- }
- // Checks if non-dereferencible loads are supported.
- if (!NonDerefLoads.empty()) {
- if (!TTI->supportsSpeculativeLoads()) {
- reportVectorizationFailure(
- "Loop may fault",
- "Cannot vectorize potentially faulting early exit loop",
- "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
- return false;
- }
- // Supports single speculative load for now.
- if (NumLoad > 1) {
- reportVectorizationFailure("More than one speculative load",
- "SpeculativeLoadNotSupported", ORE, TheLoop);
- return false;
- }
+ bool HasSafeAccess =
+ TTI->supportsSpeculativeLoads()
+ ? isReadOnlyLoopWithSafeOrSpeculativeLoads(
+ TheLoop, PSE.getSE(), DT, AC, &NonDerefLoads, &Predicates)
+ : isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
+ &Predicates);
+ if (!HasSafeAccess) {
+ reportVectorizationFailure(
+ "Loop may fault",
+ "Cannot vectorize potentially faulting early exit loop",
+ "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ return false;
}
// Speculative loads need to be unit-stride.
for (LoadInst *LI : NonDerefLoads) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0d6d69a069e1b..3a947f99d13fc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7741,7 +7741,6 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
-
if (Consecutive) {
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
@@ -7767,7 +7766,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
Ptr = VectorPtr;
}
if (Legal->getSpeculativeLoads().contains(I)) {
- LoadInst *Load = dyn_cast<LoadInst>(I);
+ auto *Load = dyn_cast<LoadInst>(I);
return new VPWidenFFLoadRecipe(*Load, Ptr, Mask, VPIRMetadata(*Load, LVer),
I->getDebugLoc());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 086b8b23a67b0..616a42dbb184d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -967,8 +967,6 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::FirstActiveLane: {
- if (VF.isScalar())
- return InstructionCost::getInvalid();
// Calculate the cost of determining the lane index.
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
>From 71c04cbef878c1d91eec938223a18c8fb4a32ead Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Mon, 4 Aug 2025 01:36:56 -0700
Subject: [PATCH 8/8] Expand the description on DataWithEVL dependency
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3a947f99d13fc..ddb10d6298fdf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10004,8 +10004,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
"SpeculativeLoadsDisabled", ORE, L);
return false;
}
- // DataWithEVL is needed to lower VPWidenFFLoadRecipe into
- // VPWidenFFLoadEVLRecipe.
+ // VPWidenFFLoadEVLRecipe is currently the only concrete recipe that generates
+ // speculative load intrinsics. Since it relies on the EVL transform,
+ // speculative loads are only supported when tail-folding with EVL is enabled.
if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL ||
PreferPredicateOverEpilogue !=
PreferPredicateTy::PredicateOrDontVectorize) {
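In practice, that restricts speculative-load vectorization to the flag
combination the find.ll test above already uses:

  opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
      -prefer-predicate-over-epilogue=predicate-dont-vectorize \
      -enable-speculative-load -mtriple=riscv64 -mattr=+v -S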