[llvm] [IA][RISCV] Support VP intrinsics in InterleavedAccessPass (PR #120490)
Min-Yih Hsu via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 24 10:39:11 PST 2024
https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/120490
>From 07e0a5ca0b8e5f5bf94ff0d9a7b3366ff51778d3 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Wed, 18 Dec 2024 14:06:10 -0800
Subject: [PATCH 1/4] [IA][RISCV] Support VP intrinsics in
InterleavedAccessPass
Teach InterleavedAccessPass to recognize the following patterns:
- vp.store of an interleaved scalable vector
- Deinterleaving a scalable vector loaded by a vp.load
- Deinterleaving a scalable vector loaded by a vp.strided.load
Upon recognizing these patterns, IA collects the interleaved /
deinterleaved operands and hands them off to their respective
newly-added TLI hooks.
For RISC-V, these patterns are lowered into segmented loads/stores,
except when we're interleaving constant splats, in which case a
unit-stride store is generated instead.
Right now we only recognize power-of-two (de)interleave cases, where
(de)interleave4/8 are synthesized from a tree of (de)interleave2
operations.
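As a minimal illustration (a factor-2 load condensed from the tests added
in this patch; the all-true mask is written in splat shorthand and the
value names are arbitrary), the pass now recognizes

  %wide = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
              ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %dei = call { <vscale x 2 x i32>, <vscale x 2 x i32> }
              @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide)

and lowers it on RISC-V to a single vlseg2 segmented load whose EVL is
%evl divided by 2.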
Co-authored-by: Nikolay Panchenko <nicholas.panchenko at gmail.com>
---
llvm/include/llvm/CodeGen/TargetLowering.h | 42 ++
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 283 ++++++++++
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 410 ++++++++++++++
llvm/lib/Target/RISCV/RISCVISelLowering.h | 12 +
.../scalable-vectors-interleaved-access.ll | 512 ++++++++++++++++++
...able-vectors-strided-interleave-load-32.ll | 161 ++++++
...able-vectors-strided-interleave-load-64.ll | 171 ++++++
7 files changed, 1591 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll
create mode 100644 llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-32.ll
create mode 100644 llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-64.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3751aac4df8ead..823f8aa8c9a7ef 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -94,6 +94,7 @@ class TargetRegisterClass;
class TargetRegisterInfo;
class TargetTransformInfo;
class Value;
+class VPIntrinsic;
namespace Sched {
@@ -3152,6 +3153,47 @@ class TargetLoweringBase {
return false;
}
+ /// Lower an interleaved load via a vp.load to target-specific
+ /// intrinsics. Return true on success.
+ ///
+ /// \p Load is the vp.load instruction.
+ /// \p Mask is the mask operand; nullptr denotes an all-true mask.
+ /// \p DeinterleaveIntrin is the root vector.deinterleave2 intrinsic.
+ /// \p Factor is the interleave factor.
+ /// \p DeinterleaveRes is a list of deinterleaved results.
+ virtual bool lowerInterleavedScalableLoad(
+ VPIntrinsic *Load, Value *Mask, IntrinsicInst *DeinterleaveIntrin,
+ unsigned Factor, ArrayRef<Value *> DeinterleaveRes) const {
+ return false;
+ }
+
+ /// Lower an interleaved store via a vp.store to target-specific
+ /// intrinsics. Return true on success.
+ ///
+ /// \p Store is the vp.store instruction.
+ /// \p Mask is the mask operand; nullptr denotes an all-true mask.
+ /// \p InterleaveIntrin is the root vector.interleave2 intrinsic.
+ /// \p Factor is the interleave factor.
+ /// \p InterleaveOps is a list of values being interleaved.
+ virtual bool lowerInterleavedScalableStore(
+ VPIntrinsic *Store, Value *Mask, IntrinsicInst *InterleaveIntrin,
+ unsigned Factor, ArrayRef<Value *> InterleaveOps) const {
+ return false;
+ }
+
+ /// Lower a deinterleave intrinsic to a target-specific strided load
+ /// intrinsic. Return true on success.
+ ///
+ /// \p StridedLoad is the vp.strided.load instruction.
+ /// \p DI is the deinterleave intrinsic.
+ /// \p Factor is the interleave factor.
+ /// \p DeinterleaveRes is a list of deinterleaved results.
+ virtual bool lowerDeinterleaveIntrinsicToStridedLoad(
+ VPIntrinsic *StridedLoad, IntrinsicInst *DI, unsigned Factor,
+ ArrayRef<Value *> DeinterleaveRes) const {
+ return false;
+ }
+
/// Lower a deinterleave intrinsic to a target specific load intrinsic.
/// Return true on success. Currently only supports
/// llvm.vector.deinterleave2
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 8b6e3180986c30..0f3b65b8d9af2f 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -60,6 +60,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
@@ -248,6 +249,186 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
return false;
}
+// For a (de)interleave tree like this:
+//
+// A C B D
+// |___| |___|
+// |_____|
+// |
+// A B C D
+//
+// We will get ABCD at the end while the leaf operands/results
+// are ACBD, which is also the order in which we initially collected
+// them in getVectorInterleaveFactor / getVectorDeInterleaveFactor. But
+// TLI hooks (e.g. lowerInterleavedScalableLoad) expect ABCD, so we need
+// to reorder them by interleaving these values.
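+// For example, with Factor == 4 the leaves are collected as [A, C, B, D]
+// and the {0, 2, 1, 3} permutation below restores [A, B, C, D].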
+static void interleaveLeafValues(SmallVectorImpl<Value *> &Leaves) {
+ unsigned Factor = Leaves.size();
+ assert(isPowerOf2_32(Factor) && Factor <= 8 && Factor > 1);
+
+ if (Factor == 2)
+ return;
+
+ SmallVector<Value *, 8> Buffer;
+ if (Factor == 4) {
+ for (unsigned SrcIdx : {0, 2, 1, 3})
+ Buffer.push_back(Leaves[SrcIdx]);
+ } else {
+ // Factor of 8.
+ //
+ // A E C G B F D H
+ // |_| |_| |_| |_|
+ // |___| |___|
+ // |_______|
+ // |
+ // A B C D E F G H
+ for (unsigned SrcIdx : {0, 4, 2, 6, 1, 5, 3, 7})
+ Buffer.push_back(Leaves[SrcIdx]);
+ }
+
+ llvm::copy(Buffer, Leaves.begin());
+}
+
+static unsigned getVectorInterleaveFactor(IntrinsicInst *II,
+ SmallVectorImpl<Value *> &Operands) {
+ if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
+ return 0;
+
+ unsigned Factor = 0;
+
+ // Visit with BFS
+ SmallVector<IntrinsicInst *, 8> Queue;
+ Queue.push_back(II);
+ while (!Queue.empty()) {
+ IntrinsicInst *Current = Queue.front();
+ Queue.erase(Queue.begin());
+
+ for (unsigned I = 0; I < 2; ++I) {
+ Value *Op = Current->getOperand(I);
+ if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
+ if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) {
+ Queue.push_back(OpII);
+ continue;
+ }
+
+ ++Factor;
+ Operands.push_back(Op);
+ }
+ }
+
+ // Currently we only recognize power-of-two factors.
+ // FIXME: should we assert here instead?
+ if (Factor > 1 && isPowerOf2_32(Factor)) {
+ interleaveLeafValues(Operands);
+ return Factor;
+ }
+ return 0;
+}
+
+/// Check the interleaved mask.
+///
+/// - If the returned optional holds a non-null value, that value is the
+///   deinterleaved mask.
+/// - If the returned optional holds nullptr, the wide mask is all-true.
+/// - Return std::nullopt if the mask cannot be deinterleaved.
+static std::optional<Value *> getMask(Value *WideMask, unsigned Factor) {
+ using namespace llvm::PatternMatch;
+ if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
+ SmallVector<Value *, 8> Operands;
+ if (unsigned MaskFactor = getVectorInterleaveFactor(IMI, Operands)) {
+ assert(!Operands.empty());
+ if (MaskFactor == Factor &&
+ std::equal(Operands.begin() + 1, Operands.end(), Operands.begin()))
+ return Operands.front();
+ }
+ }
+ if (match(WideMask, m_AllOnes()))
+ return nullptr;
+ return std::nullopt;
+}
+
+static unsigned getVectorDeInterleaveFactor(IntrinsicInst *II,
+ SmallVectorImpl<Value *> &Results) {
+ using namespace PatternMatch;
+ if (II->getIntrinsicID() != Intrinsic::vector_deinterleave2 ||
+ !II->hasNUses(2))
+ return 0;
+
+ unsigned Factor = 0;
+
+ // Visit with BFS
+ SmallVector<IntrinsicInst *, 8> Queue;
+ Queue.push_back(II);
+ while (!Queue.empty()) {
+ IntrinsicInst *Current = Queue.front();
+ Queue.erase(Queue.begin());
+ assert(Current->hasNUses(2));
+
+ unsigned VisitedIdx = 0;
+ for (User *Usr : Current->users()) {
+ // We're playing it safe here and match only expressions consisting
+ // of a perfectly balanced binary tree in which all intermediate
+ // values are used exactly once.
+ if (!Usr->hasOneUse() || !isa<ExtractValueInst>(Usr))
+ return 0;
+
+ auto *EV = cast<ExtractValueInst>(Usr);
+ ArrayRef<unsigned> Indices = EV->getIndices();
+ if (Indices.size() != 1 || Indices[0] >= 2)
+ return 0;
+
+ // The idea is that we don't want two extractvalues with the same
+ // index. So we XOR (index + 1) onto VisitedIdx such that if there
+ // is any duplication, VisitedIdx becomes zero.
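+ // For example, visiting indices 0 then 1 gives VisitedIdx = 1 and then
+ // 1 ^ 2 = 3, whereas visiting index 0 twice gives 1 ^ 1 = 0, which is
+ // rejected right below.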
+ VisitedIdx ^= Indices[0] + 1;
+ if (!VisitedIdx)
+ return 0;
+ // We have a legal index. At this point we're either going to
+ // continue the traversal or push the leaf values into Results.
+ // In either case we need to follow the order imposed by the
+ // ExtractValue indices and, if necessary, swap with the last
+ // element pushed into Queue/Results (this is also one of the main
+ // reasons for using BFS instead of DFS here).
+
+ // When VisitedIdx equals 0b11, we're the last visited ExtractValue,
+ // so if the current index is 0, we need to swap. Conversely, when
+ // we're the first visited ExtractValue or when the last operand in
+ // Queue/Results has index 0, there is no need to swap.
+ bool SwapWithLast = VisitedIdx == 0b11 && Indices[0] == 0;
+
+ // Continue the traversal.
+ if (match(EV->user_back(),
+ m_Intrinsic<Intrinsic::vector_deinterleave2>()) &&
+ EV->user_back()->hasNUses(2)) {
+ auto *EVUsr = cast<IntrinsicInst>(EV->user_back());
+ if (SwapWithLast)
+ Queue.insert(Queue.end() - 1, EVUsr);
+ else
+ Queue.push_back(EVUsr);
+ continue;
+ }
+
+ // Save the leaf value.
+ if (SwapWithLast)
+ Results.insert(Results.end() - 1, EV);
+ else
+ Results.push_back(EV);
+
+ ++Factor;
+ }
+ }
+
+ // Currently we only recognize power-of-two factors.
+ // FIXME: should we assert here instead?
+ if (Factor > 1 && isPowerOf2_32(Factor)) {
+ interleaveLeafValues(Results);
+ return Factor;
+ }
+ return 0;
+}
+
bool InterleavedAccessImpl::lowerInterleavedLoad(
LoadInst *LI, SmallVectorImpl<Instruction *> &DeadInsts) {
if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
@@ -480,6 +661,81 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
IntrinsicInst *DI, SmallVectorImpl<Instruction *> &DeadInsts) {
+ using namespace PatternMatch;
+ SmallVector<Value *, 8> DeInterleaveResults;
+ unsigned Factor = getVectorDeInterleaveFactor(DI, DeInterleaveResults);
+
+ if (auto *VPLoad = dyn_cast<VPIntrinsic>(DI->getOperand(0));
+ Factor && VPLoad) {
+ if (!match(VPLoad, m_OneUse(m_Intrinsic<Intrinsic::vp_load>())))
+ return false;
+
+ // Check mask operand. Handle both all-true and interleaved mask.
+ Value *WideMask = VPLoad->getOperand(1);
+ std::optional<Value *> Mask = getMask(WideMask, Factor);
+ if (!Mask)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+
+ // Since lowerInterleavedLoad expects shuffles and a LoadInst, use a
+ // dedicated TLI hook to emit the target-specific interleaved instruction.
+ if (!TLI->lowerInterleavedScalableLoad(VPLoad, *Mask, DI, Factor,
+ DeInterleaveResults))
+ return false;
+
+ DeadInsts.push_back(DI);
+ DeadInsts.push_back(VPLoad);
+ return true;
+ }
+
+ // Match
+ // %x = vp.strided.load ;; VPStridedLoad
+ // %y = bitcast %x ;; BitCast
+ // %y' = inttoptr %y ;; optional IntToPtrCast
+ // %z = deinterleave %y' ;; DI (deinterleaves %y directly if the inttoptr is absent)
+ if (Factor && isa<BitCastInst, IntToPtrInst>(DI->getOperand(0))) {
+ auto *BitCast = cast<Instruction>(DI->getOperand(0));
+ if (!BitCast->hasOneUse())
+ return false;
+
+ Instruction *IntToPtrCast = nullptr;
+ if (auto *BC = dyn_cast<BitCastInst>(BitCast->getOperand(0))) {
+ IntToPtrCast = BitCast;
+ BitCast = BC;
+ }
+
+ // Check that the bitcast is from
+ // <VF x (Factor * ElementTy)> to <(VF * Factor) x ElementTy>
+ Value *BitCastSrc = BitCast->getOperand(0);
+ auto *BitCastSrcTy = dyn_cast<VectorType>(BitCastSrc->getType());
+ auto *BitCastDstTy = cast<VectorType>(BitCast->getType());
+ if (!BitCastSrcTy || (BitCastSrcTy->getElementCount() * Factor !=
+ BitCastDstTy->getElementCount()))
+ return false;
+
+ if (auto *VPStridedLoad = dyn_cast<VPIntrinsic>(BitCast->getOperand(0))) {
+ if (VPStridedLoad->getIntrinsicID() !=
+ Intrinsic::experimental_vp_strided_load ||
+ !VPStridedLoad->hasOneUse())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI
+ << "\n");
+
+ if (!TLI->lowerDeinterleaveIntrinsicToStridedLoad(
+ VPStridedLoad, DI, Factor, DeInterleaveResults))
+ return false;
+
+ DeadInsts.push_back(DI);
+ if (IntToPtrCast)
+ DeadInsts.push_back(IntToPtrCast);
+ DeadInsts.push_back(BitCast);
+ DeadInsts.push_back(VPStridedLoad);
+ return true;
+ }
+ }
+
LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
if (!LI || !LI->hasOneUse() || !LI->isSimple())
@@ -502,6 +758,33 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
if (!II->hasOneUse())
return false;
+ if (auto *VPStore = dyn_cast<VPIntrinsic>(*(II->users().begin()))) {
+ if (VPStore->getIntrinsicID() != Intrinsic::vp_store)
+ return false;
+
+ SmallVector<Value *, 8> InterleaveOperands;
+ unsigned Factor = getVectorInterleaveFactor(II, InterleaveOperands);
+ if (!Factor)
+ return false;
+
+ Value *WideMask = VPStore->getOperand(2);
+ std::optional<Value *> Mask = getMask(WideMask, Factor);
+ if (!Mask)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+
+ // Since lowerInterleavedStore expects a ShuffleVectorInst and StoreInst,
+ // use a dedicated TLI hook to emit the target-specific interleaved
+ // instruction.
+ if (!TLI->lowerInterleavedScalableStore(VPStore, *Mask, II, Factor,
+ InterleaveOperands))
+ return false;
+
+ DeadInsts.push_back(VPStore);
+ DeadInsts.push_back(II);
+ return true;
+ }
+
StoreInst *SI = dyn_cast<StoreInst>(*(II->users().begin()));
if (!SI || !SI->isSimple())
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index b703eb90e8ef30..2dafbf737512a9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -22190,6 +22190,416 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
return true;
}
+/// Lower an interleaved vp.load into a vlsegN intrinsic.
+///
+/// E.g. Lower an interleaved vp.load (Factor = 2):
+/// %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr,
+/// %mask,
+/// i32 %wide.rvl)
+/// %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> }
+/// @llvm.vector.deinterleave2.nxv64i8(
+/// <vscale x 64 x i8> %l)
+/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0
+/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1
+///
+/// Into:
+/// %rvl = udiv %wide.rvl, 2
+/// %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> }
+/// @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef,
+/// <vscale x 32 x i8> undef,
+/// ptr %ptr,
+/// %mask,
+/// i64 %rvl,
+/// i64 1)
+/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0
+/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1
+///
+/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be
+/// removed by the caller
+bool RISCVTargetLowering::lowerInterleavedScalableLoad(
+ VPIntrinsic *Load, Value *Mask, IntrinsicInst *DeinterleaveIntrin,
+ unsigned Factor, ArrayRef<Value *> DeInterleaveResults) const {
+ assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
+ "Unexpected intrinsic");
+
+ auto *WideVTy = cast<VectorType>(Load->getType());
+ unsigned WideNumElements = WideVTy->getElementCount().getKnownMinValue();
+ assert(WideNumElements % Factor == 0 &&
+ "ElementCount of a wide load must be divisible by interleave factor");
+ auto *VTy =
+ VectorType::get(WideVTy->getScalarType(), WideNumElements / Factor,
+ WideVTy->isScalableTy());
+ // FIXME: Should pass alignment attribute from pointer, but vectorizer needs
+ // to emit it first.
+ auto &DL = Load->getModule()->getDataLayout();
+ Align Alignment = Align(DL.getTypeStoreSize(WideVTy->getScalarType()));
+ if (!isLegalInterleavedAccessType(
+ VTy, Factor, Alignment,
+ Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
+ return false;
+
+ IRBuilder<> Builder(Load);
+ Value *WideEVL = Load->getArgOperand(2);
+ auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+ Value *EVL = Builder.CreateZExtOrTrunc(
+ Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
+ XLenTy);
+
+ static const Intrinsic::ID IntrMaskIds[] = {
+ Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
+ Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
+ Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
+ Intrinsic::riscv_vlseg8_mask,
+ };
+ static const Intrinsic::ID IntrIds[] = {
+ Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, Intrinsic::riscv_vlseg4,
+ Intrinsic::riscv_vlseg5, Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7,
+ Intrinsic::riscv_vlseg8,
+ };
+
+ unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
+ unsigned NumElts = VTy->getElementCount().getKnownMinValue();
+ Type *VecTupTy = TargetExtType::get(
+ Load->getContext(), "riscv.vector.tuple",
+ ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
+ NumElts * SEW / 8),
+ Factor);
+
+ Value *PoisonVal = PoisonValue::get(VecTupTy);
+ SmallVector<Value *> Operands;
+ Operands.append({PoisonVal, Load->getArgOperand(0)});
+
+ Function *VlsegNFunc;
+ if (Mask) {
+ VlsegNFunc = Intrinsic::getOrInsertDeclaration(
+ Load->getModule(), IntrMaskIds[Factor - 2],
+ {VecTupTy, Mask->getType(), EVL->getType()});
+ Operands.push_back(Mask);
+ } else {
+ VlsegNFunc = Intrinsic::getOrInsertDeclaration(
+ Load->getModule(), IntrIds[Factor - 2], {VecTupTy, EVL->getType()});
+ }
+
+ Operands.push_back(EVL);
+
+ // Tail-policy
+ if (Mask)
+ Operands.push_back(ConstantInt::get(XLenTy, 1));
+
+ Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW)));
+
+ CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
+
+ SmallVector<Type *, 8> AggrTypes{Factor, VTy};
+ Value *Return =
+ PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
+ Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
+ Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
+ for (unsigned i = 0; i < Factor; ++i) {
+ Value *VecExtract =
+ Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
+ Return = Builder.CreateInsertValue(Return, VecExtract, i);
+ }
+
+ for (auto [Idx, DIO] : enumerate(DeInterleaveResults)) {
+ // We have to create a brand new ExtractValue to replace each
+ // of these old ExtractValue instructions.
+ Value *NewEV =
+ Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
+ DIO->replaceAllUsesWith(NewEV);
+ }
+ DeinterleaveIntrin->replaceAllUsesWith(
+ UndefValue::get(DeinterleaveIntrin->getType()));
+
+ return true;
+}
+
+/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
+/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
+/// larger splat
+/// `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first before casting it
+/// into
+/// `<vscale x 16 x i32>`. This results in a simple unit-stride store rather
+/// than a segmented store, which is more expensive in this case.
+static Value *foldInterleaved2OfConstSplats(IntrinsicInst *InterleaveIntrin,
+ VectorType *VTy,
+ const TargetLowering *TLI,
+ Instruction *VPStore) {
+ // We only handle Factor = 2 for now.
+ assert(InterleaveIntrin->arg_size() == 2);
+ auto *SplatVal0 = dyn_cast_or_null<ConstantInt>(
+ getSplatValue(InterleaveIntrin->getArgOperand(0)));
+ auto *SplatVal1 = dyn_cast_or_null<ConstantInt>(
+ getSplatValue(InterleaveIntrin->getArgOperand(1)));
+ if (!SplatVal0 || !SplatVal1)
+ return nullptr;
+
+ auto &Ctx = VPStore->getContext();
+ auto &DL = VPStore->getModule()->getDataLayout();
+
+ auto *NewVTy = VectorType::getExtendedElementVectorType(VTy);
+ if (!TLI->isTypeLegal(TLI->getValueType(DL, NewVTy)))
+ return nullptr;
+
+ // InterleavedAccessPass will remove the original VPStore after this, but we
+ // still need the store to happen, hence we clone one here.
+ auto *ClonedVPStore = VPStore->clone();
+ ClonedVPStore->insertBefore(VPStore);
+ IRBuilder<> Builder(ClonedVPStore);
+
+ Type *ETy = VTy->getElementType();
+ unsigned Width = ETy->getIntegerBitWidth();
+
+ APInt NewSplatVal(Width * 2, SplatVal1->getZExtValue());
+ NewSplatVal <<= Width;
+ NewSplatVal |= SplatVal0->getZExtValue();
+ auto *NewSplat = ConstantVector::getSplat(NewVTy->getElementCount(),
+ ConstantInt::get(Ctx, NewSplatVal));
+ return Builder.CreateBitCast(NewSplat,
+ VectorType::getDoubleElementsVectorType(VTy));
+}
+
+/// Lower an interleaved vp.store into a vssegN intrinsic.
+///
+/// E.g. Lower an interleaved vp.store (Factor = 2):
+///
+/// %is = tail call <vscale x 64 x i8>
+/// @llvm.vector.interleave2.nxv64i8(
+/// <vscale x 32 x i8> %load0,
+/// <vscale x 32 x i8> %load1)
+/// %wide.rvl = shl nuw nsw i32 %rvl, 1
+/// tail call void @llvm.vp.store.nxv64i8.p0(
+/// <vscale x 64 x i8> %is, ptr %ptr,
+/// %mask,
+/// i32 %wide.rvl)
+///
+/// Into:
+/// call void @llvm.riscv.vsseg2.mask.nxv32i8.i64(
+/// <vscale x 32 x i8> %load0,
+/// <vscale x 32 x i8> %load1, ptr %ptr,
+/// %mask,
+/// i64 %rvl)
+bool RISCVTargetLowering::lowerInterleavedScalableStore(
+ VPIntrinsic *Store, Value *Mask, IntrinsicInst *InterleaveIntrin,
+ unsigned Factor, ArrayRef<Value *> InterleaveOperands) const {
+ assert(Store->getIntrinsicID() == Intrinsic::vp_store &&
+ "Unexpected intrinsic");
+
+ VectorType *VTy = cast<VectorType>(InterleaveOperands[0]->getType());
+
+ // FIXME: Should pass alignment attribute from pointer, but vectorizer needs
+ // to emit it first.
+ const DataLayout &DL = Store->getDataLayout();
+ Align Alignment = Align(DL.getTypeStoreSize(VTy->getScalarType()));
+ if (!isLegalInterleavedAccessType(
+ VTy, Factor, Alignment,
+ Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL))
+ return false;
+
+ if (Factor == 2)
+ if (Value *BC =
+ foldInterleaved2OfConstSplats(InterleaveIntrin, VTy, this, Store)) {
+ InterleaveIntrin->replaceAllUsesWith(BC);
+ return true;
+ }
+
+ IRBuilder<> Builder(Store);
+ Value *WideEVL = Store->getArgOperand(3);
+ auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
+ Value *EVL = Builder.CreateZExtOrTrunc(
+ Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
+ XLenTy);
+
+ static const Intrinsic::ID IntrMaskIds[] = {
+ Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
+ Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
+ Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
+ Intrinsic::riscv_vsseg8_mask,
+ };
+ static const Intrinsic::ID IntrIds[] = {
+ Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, Intrinsic::riscv_vsseg4,
+ Intrinsic::riscv_vsseg5, Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7,
+ Intrinsic::riscv_vsseg8,
+ };
+
+ unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
+ unsigned NumElts = VTy->getElementCount().getKnownMinValue();
+ Type *VecTupTy = TargetExtType::get(
+ Store->getContext(), "riscv.vector.tuple",
+ ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
+ NumElts * SEW / 8),
+ Factor);
+
+ Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
+ Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy});
+ Value *StoredVal = PoisonValue::get(VecTupTy);
+ for (unsigned i = 0; i < Factor; ++i)
+ StoredVal = Builder.CreateCall(
+ VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)});
+
+ SmallVector<Value *, 5> Operands;
+ Operands.push_back(StoredVal);
+ Operands.push_back(Store->getArgOperand(1));
+
+ Function *VssegNFunc;
+ if (Mask) {
+ VssegNFunc = Intrinsic::getOrInsertDeclaration(
+ Store->getModule(), IntrMaskIds[Factor - 2],
+ {VecTupTy, Mask->getType(), EVL->getType()});
+ Operands.push_back(Mask);
+ } else {
+ VssegNFunc = Intrinsic::getOrInsertDeclaration(
+ Store->getModule(), IntrIds[Factor - 2], {VecTupTy, EVL->getType()});
+ }
+
+ Operands.push_back(EVL);
+ Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW)));
+
+ Builder.CreateCall(VssegNFunc, Operands);
+ return true;
+}
+
+/// Lower an interleaved vp.strided.load into a vlssegN intrinsic.
+///
+/// E.g. Lower an interleaved vp.strided.load (Factor = 2):
+/// %l = call <vscale x 2 x i16>
+/// @llvm.experimental.vp.strided.load.nxv2i16.p0.i64(ptr %ptr,
+/// %stride,
+/// <all-true-mask>,
+/// i32 %rvl)
+/// %l.cast = bitcast <vscale x 2 x i16> %l to <vscale x 4 x i8>
+/// %dl = tail call { <vscale x 2 x i8>, <vscale x 2 x i8> }
+/// @llvm.vector.deinterleave2.nxv2i8(
+/// <vscale x 4 x i8> %l.cast)
+/// %r0 = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i8> } %dl, 0
+/// %r1 = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i8> } %dl, 1
+///
+/// Into:
+/// %ssl = call { <vscale x 2 x i8>, <vscale x 2 x i8> }
+/// @llvm.riscv.vlsseg2.nxv2i8.i64(<vscale x 2 x i8> poison,
+/// <vscale x 2 x i8> poison,
+/// %ptr,
+/// %stride,
+/// i64 %rvl)
+/// %r0 = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i8> } %ssl, 0
+/// %r1 = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i8> } %ssl, 1
+///
+/// NOTE: the deinterleave2 intrinsic and the bitcast instruction won't be
+/// touched and are expected to be removed by the caller.
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToStridedLoad(
+ VPIntrinsic *StridedLoad, IntrinsicInst *DI, unsigned Factor,
+ ArrayRef<Value *> DeInterleaveResults) const {
+ using namespace llvm::PatternMatch;
+ Value *BasePtr, *Stride, *Mask, *EVL;
+ if (!match(StridedLoad, m_Intrinsic<Intrinsic::experimental_vp_strided_load>(
+ m_Value(BasePtr), m_Value(Stride), m_Value(Mask),
+ m_Value(EVL))))
+ return false;
+
+ [[maybe_unused]] auto *DISrcTy =
+ cast<VectorType>(DI->getOperand(0)->getType());
+ [[maybe_unused]] auto *LTy = cast<VectorType>(StridedLoad->getType());
+ auto &DL = StridedLoad->getModule()->getDataLayout();
+ assert(DL.getTypeAllocSizeInBits(DISrcTy) == DL.getTypeAllocSizeInBits(LTy) &&
+ "The primitive size of strided load and the source of deinterleave "
+ "should be the same.");
+ assert(DISrcTy->getElementCount() == LTy->getElementCount() * Factor &&
+ "ElementCount of source deinterleave should be equal to the "
+ "ElementCount of strided load multiplied by factor.");
+
+ auto *ResTy = cast<VectorType>(DeInterleaveResults[0]->getType());
+
+ Align Alignment =
+ cast<VPIntrinsic>(StridedLoad)->getPointerAlignment().valueOrOne();
+ if (!isLegalInterleavedAccessType(
+ ResTy, Factor, Alignment,
+ BasePtr->getType()->getPointerAddressSpace(), DL))
+ return false;
+
+ IRBuilder<> Builder(StridedLoad);
+ auto *XLenTy =
+ Type::getIntNTy(StridedLoad->getContext(), Subtarget.getXLen());
+ assert(Stride->getType() == XLenTy &&
+ "The type of stride must be the XLEN integer type.");
+ EVL = Builder.CreateZExtOrTrunc(EVL, XLenTy);
+
+ static const Intrinsic::ID IntrMaskIds[] = {
+ Intrinsic::riscv_vlsseg2_mask, Intrinsic::riscv_vlsseg3_mask,
+ Intrinsic::riscv_vlsseg4_mask, Intrinsic::riscv_vlsseg5_mask,
+ Intrinsic::riscv_vlsseg6_mask, Intrinsic::riscv_vlsseg7_mask,
+ Intrinsic::riscv_vlsseg8_mask,
+ };
+
+ static const Intrinsic::ID IntrIds[] = {
+ Intrinsic::riscv_vlsseg2, Intrinsic::riscv_vlsseg3,
+ Intrinsic::riscv_vlsseg4, Intrinsic::riscv_vlsseg5,
+ Intrinsic::riscv_vlsseg6, Intrinsic::riscv_vlsseg7,
+ Intrinsic::riscv_vlsseg8,
+ };
+
+ unsigned SEW = DL.getTypeSizeInBits(ResTy->getElementType());
+ unsigned NumElts = ResTy->getElementCount().getKnownMinValue();
+ Type *VecTupTy = TargetExtType::get(
+ StridedLoad->getContext(), "riscv.vector.tuple",
+ ScalableVectorType::get(Type::getInt8Ty(StridedLoad->getContext()),
+ NumElts * SEW / 8),
+ Factor);
+
+ Value *PoisonVal = PoisonValue::get(VecTupTy);
+ SmallVector<Value *, 7> Operands;
+ Operands.append({PoisonVal, BasePtr, Stride});
+
+ Intrinsic::ID VlssegNID = IntrIds[Factor - 2];
+ bool IsMasked = !match(Mask, m_AllOnes());
+ if (IsMasked) {
+ VlssegNID = IntrMaskIds[Factor - 2];
+ Operands.push_back(Mask);
+ }
+
+ Operands.push_back(EVL);
+
+ // Set the tail policy to tail-agnostic, mask-agnostic (tama) for masked
+ // intrinsics
+ if (IsMasked)
+ Operands.push_back(ConstantInt::get(XLenTy, 3));
+
+ Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW)));
+
+ Function *VlssegNFunc;
+ if (IsMasked) {
+ VlssegNFunc = Intrinsic::getOrInsertDeclaration(
+ StridedLoad->getModule(), VlssegNID,
+ {VecTupTy, EVL->getType(), Mask->getType()});
+ } else {
+ VlssegNFunc = Intrinsic::getOrInsertDeclaration(
+ StridedLoad->getModule(), VlssegNID, {VecTupTy, EVL->getType()});
+ }
+ CallInst *VlssegN = Builder.CreateCall(VlssegNFunc, Operands);
+
+ SmallVector<Type *, 8> AggrTypes{Factor, ResTy};
+ Value *Return =
+ PoisonValue::get(StructType::get(StridedLoad->getContext(), AggrTypes));
+ Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
+ StridedLoad->getModule(), Intrinsic::riscv_tuple_extract,
+ {ResTy, VecTupTy});
+ for (unsigned i = 0; i < Factor; ++i) {
+ Value *VecExtract =
+ Builder.CreateCall(VecExtractFunc, {VlssegN, Builder.getInt32(i)});
+ Return = Builder.CreateInsertValue(Return, VecExtract, i);
+ }
+
+ for (auto [Idx, DIO] : enumerate(DeInterleaveResults)) {
+ // We have to create a brand new ExtractValue to replace each
+ // of these old ExtractValue instructions.
+ Value *NewEV =
+ Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
+ DIO->replaceAllUsesWith(NewEV);
+ }
+ DI->replaceAllUsesWith(UndefValue::get(DI->getType()));
+
+ return true;
+}
+
MachineInstr *
RISCVTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
MachineBasicBlock::instr_iterator &MBBI,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 0944bb8793a949..b11def055ba7fa 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -909,6 +909,18 @@ class RISCVTargetLowering : public TargetLowering {
IntrinsicInst *II, StoreInst *SI,
SmallVectorImpl<Instruction *> &DeadInsts) const override;
+ bool lowerInterleavedScalableLoad(
+ VPIntrinsic *Load, Value *Mask, IntrinsicInst *DeinterleaveIntrin,
+ unsigned Factor, ArrayRef<Value *> DeinterleaveRes) const override;
+
+ bool lowerInterleavedScalableStore(
+ VPIntrinsic *Store, Value *Mask, IntrinsicInst *InterleaveIntrin,
+ unsigned Factor, ArrayRef<Value *> InterleaveOps) const override;
+
+ bool lowerDeinterleaveIntrinsicToStridedLoad(
+ VPIntrinsic *StridedLoad, IntrinsicInst *DI, unsigned Factor,
+ ArrayRef<Value *> DeinterleaveRes) const override;
+
bool supportKCFIBundles() const override { return true; }
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll
new file mode 100644
index 00000000000000..4fcfefcdfcaa07
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll
@@ -0,0 +1,512 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV32 %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV64 %s
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor2_v2(ptr %ptr, i32 %rvl) {
+; RV32-LABEL: load_factor2_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg2e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: load_factor2_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a1, a1, 1
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg2e32.v v8, (a0)
+; RV64-NEXT: ret
+ %wide.masked.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %rvl)
+ %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor4_v2(ptr %ptr, i32 %rvl) {
+; RV32-LABEL: load_factor4_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg4e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: load_factor4_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a1, a1, 2
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg4e32.v v8, (a0)
+; RV64-NEXT: ret
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i32 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 %rvl)
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor8_v2(ptr %ptr, i32 %rvl) {
+; RV32-LABEL: load_factor8_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a1, a1, 3
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg8e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: load_factor8_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a1, a1, 3
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg8e32.v v8, (a0)
+; RV64-NEXT: ret
+ %wide.masked.load = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr %ptr, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i32 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 %rvl)
+ %d0 = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 1
+ %d1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.0)
+ %d1.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 0
+ %d1.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 1
+ %d2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.1)
+ %d2.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 0
+ %d2.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 1
+
+ %d3 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 0
+ %t4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 1
+ %d4 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.1)
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 0
+ %t6 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 1
+ %d5 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.0)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 0
+ %t5 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 1
+ %d6 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.1)
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 0
+ %t7 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ %res4 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3, <vscale x 2 x i32> %t4, 4
+ %res5 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res4, <vscale x 2 x i32> %t5, 5
+ %res6 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res5, <vscale x 2 x i32> %t6, 6
+ %res7 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res6, <vscale x 2 x i32> %t7, 7
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res7
+}
+
+define void @store_factor2_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %rvl) {
+; RV32-LABEL: store_factor2_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vsseg2e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: store_factor2_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a1, a1, 1
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vsseg2e32.v v8, (a0)
+; RV64-NEXT: ret
+ %interleaved.vec = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1)
+ call void @llvm.vp.store.nxv2i32.p0(<vscale x 2 x i32> %interleaved.vec, ptr %ptr, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
+ ret void
+}
+
+; Expecting a unit-stride store here rather than a segmented store.
+define void @store_factor2_const_splat(ptr %dst) {
+; RV32-LABEL: store_factor2_const_splat:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: li a1, 777
+; RV32-NEXT: li a2, 666
+; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a1), zero
+; RV32-NEXT: li a1, 87
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: store_factor2_const_splat:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 777
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: addi a1, a1, 666
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: li a1, 87
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: ret
+ %interleave2 = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(
+ <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 666, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer),
+ <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 777, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ )
+ call void @llvm.vp.store.nxv16i32.p0(<vscale x 16 x i32> %interleave2, ptr %dst,
+ <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 1, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer),
+ i32 87)
+ ret void
+}
+
+define void @store_factor4_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %rvl) {
+; RV32-LABEL: store_factor4_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v10, v8
+; RV32-NEXT: vmv1r.v v11, v9
+; RV32-NEXT: vsseg4e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: store_factor4_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a1, a1, 2
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v10, v8
+; RV64-NEXT: vmv1r.v v11, v9
+; RV64-NEXT: vsseg4e32.v v8, (a0)
+; RV64-NEXT: ret
+ %interleaved.vec0 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ %interleaved.vec1 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v1, <vscale x 1 x i32> %v1)
+ %interleaved.vec2 = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %interleaved.vec0, <vscale x 2 x i32> %interleaved.vec1)
+ call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %interleaved.vec2, ptr %ptr, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %rvl)
+ ret void
+}
+
+define void @store_factor8_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %rvl) {
+; RV32-LABEL: store_factor8_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a1, a1, 3
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v10, v8
+; RV32-NEXT: vmv1r.v v11, v9
+; RV32-NEXT: vmv1r.v v12, v8
+; RV32-NEXT: vmv1r.v v13, v9
+; RV32-NEXT: vmv1r.v v14, v8
+; RV32-NEXT: vmv1r.v v15, v9
+; RV32-NEXT: vsseg8e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: store_factor8_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a1, a1, 3
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v10, v8
+; RV64-NEXT: vmv1r.v v11, v9
+; RV64-NEXT: vmv1r.v v12, v8
+; RV64-NEXT: vmv1r.v v13, v9
+; RV64-NEXT: vmv1r.v v14, v8
+; RV64-NEXT: vmv1r.v v15, v9
+; RV64-NEXT: vsseg8e32.v v8, (a0)
+; RV64-NEXT: ret
+ %interleaved.vec0 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ %interleaved.vec1 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ %interleaved.vec2 = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %interleaved.vec0, <vscale x 2 x i32> %interleaved.vec1)
+ %interleaved.vec3 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v1, <vscale x 1 x i32> %v1)
+ %interleaved.vec4 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v1, <vscale x 1 x i32> %v1)
+ %interleaved.vec5 = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %interleaved.vec3, <vscale x 2 x i32> %interleaved.vec4)
+ %interleaved.vec6 = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %interleaved.vec2, <vscale x 4 x i32> %interleaved.vec5)
+ call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> %interleaved.vec6, ptr %ptr, <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i32 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 %rvl)
+ ret void
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @masked_load_factor2_v2(<vscale x 2 x i1> %mask, ptr %ptr, i32 %rvl) {
+; RV32-LABEL: masked_load_factor2_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_load_factor2_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a1, a1, 1
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t
+; RV64-NEXT: ret
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> %interleaved.mask, i32 %rvl)
+ %deinterleaved.results = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 4 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @masked_load_factor4_v2(<vscale x 2 x i1> %mask, ptr %ptr, i32 %rvl) {
+; RV32-LABEL: masked_load_factor4_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg4e32.v v8, (a0), v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_load_factor4_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a1, a1, 2
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg4e32.v v8, (a0), v0.t
+; RV64-NEXT: ret
+ %interleaved.mask0 = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
+ %interleaved.mask1 = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
+ %interleaved.mask2 = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %interleaved.mask0, <vscale x 4 x i1> %interleaved.mask1)
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> %interleaved.mask2, i32 %rvl)
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+define void @masked_store_factor2_v2(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %rvl) {
+; RV32-LABEL: masked_store_factor2_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v9, v8
+; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_store_factor2_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a1, a1, 1
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v9, v8
+; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t
+; RV64-NEXT: ret
+ %interleaved.mask = tail call <vscale x 2 x i1> @llvm.vector.interleave2.nxv2i1(<vscale x 1 x i1> %mask, <vscale x 1 x i1> %mask)
+ %interleaved.vec = tail call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ tail call void @llvm.vp.store.nxv2i32.p0(<vscale x 2 x i32> %interleaved.vec, ptr %ptr, <vscale x 2 x i1> %interleaved.mask, i32 %rvl)
+ ret void
+}
+
+define void @masked_load_store_factor2_v2_shared_mask(<vscale x 2 x i1> %mask, ptr %ptr, i32 %rvl) {
+; RV32-LABEL: masked_load_store_factor2_v2_shared_mask:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t
+; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_load_store_factor2_v2_shared_mask:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a1, a1, 1
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t
+; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t
+; RV64-NEXT: ret
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> %interleaved.mask, i32 %rvl)
+ %deinterleaved.results = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 4 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %interleaved.vec = tail call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %t0, <vscale x 2 x i32> %t1)
+ tail call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %interleaved.vec, ptr %ptr, <vscale x 4 x i1> %interleaved.mask, i32 %rvl)
+ ret void
+}
+
+define i32 @masked_load_store_factor2_v2_shared_mask_extract(<vscale x 2 x i1> %mask, ptr %ptr, i32 %rvl) {
+; RV32-LABEL: masked_load_store_factor2_v2_shared_mask_extract:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmv1r.v v8, v0
+; RV32-NEXT: vmv.v.i v9, 0
+; RV32-NEXT: li a2, -1
+; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmerge.vim v11, v9, 1, v0
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: vwaddu.vv v12, v11, v11
+; RV32-NEXT: vwmaccu.vx v12, a2, v11
+; RV32-NEXT: vmsne.vi v0, v12, 0
+; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslidedown.vx v11, v12, a3
+; RV32-NEXT: vmerge.vim v10, v10, 1, v0
+; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmsne.vi v0, v11, 0
+; RV32-NEXT: add a2, a3, a3
+; RV32-NEXT: vmerge.vim v9, v9, 1, v0
+; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v10, v9, a3
+; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmsne.vi v0, v10, 0
+; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; RV32-NEXT: vle32.v v10, (a0), v0.t
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV32-NEXT: vnsrl.wx v13, v10, a2
+; RV32-NEXT: vmv.x.s a2, v10
+; RV32-NEXT: vnsrl.wi v12, v10, 0
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vmv1r.v v0, v8
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vsseg2e32.v v12, (a0), v0.t
+; RV32-NEXT: mv a0, a2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_load_store_factor2_v2_shared_mask_extract:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmv1r.v v8, v0
+; RV64-NEXT: vmv.v.i v9, 0
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmerge.vim v11, v9, 1, v0
+; RV64-NEXT: srli a3, a3, 2
+; RV64-NEXT: vwaddu.vv v12, v11, v11
+; RV64-NEXT: vwmaccu.vx v12, a2, v11
+; RV64-NEXT: vmsne.vi v0, v12, 0
+; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslidedown.vx v11, v12, a3
+; RV64-NEXT: vmerge.vim v10, v10, 1, v0
+; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmsne.vi v0, v11, 0
+; RV64-NEXT: add a2, a3, a3
+; RV64-NEXT: vmerge.vim v9, v9, 1, v0
+; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v10, v9, a3
+; RV64-NEXT: slli a2, a1, 32
+; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmsne.vi v0, v10, 0
+; RV64-NEXT: srli a2, a2, 32
+; RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma
+; RV64-NEXT: vle32.v v10, (a0), v0.t
+; RV64-NEXT: li a2, 32
+; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV64-NEXT: vnsrl.wx v13, v10, a2
+; RV64-NEXT: vmv.x.s a2, v10
+; RV64-NEXT: vnsrl.wi v12, v10, 0
+; RV64-NEXT: srliw a1, a1, 1
+; RV64-NEXT: vmv1r.v v0, v8
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t
+; RV64-NEXT: mv a0, a2
+; RV64-NEXT: ret
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> %interleaved.mask, i32 %rvl)
+ %deinterleaved.results = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 4 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %r0 = extractelement <vscale x 4 x i32> %wide.masked.load, i32 0
+ %interleaved.vec = tail call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %t0, <vscale x 2 x i32> %t1)
+ tail call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %interleaved.vec, ptr %ptr, <vscale x 4 x i1> %interleaved.mask, i32 %rvl)
+ ret i32 %r0
+}
+
+define void @masked_store_factor4_v2(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %rvl) {
+; RV32-LABEL: masked_store_factor4_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v10, v8
+; RV32-NEXT: vmv1r.v v11, v9
+; RV32-NEXT: vsseg4e32.v v8, (a0), v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_store_factor4_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a1, a1, 2
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v10, v8
+; RV64-NEXT: vmv1r.v v11, v9
+; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t
+; RV64-NEXT: ret
+ %interleaved.mask0 = call <vscale x 2 x i1> @llvm.vector.interleave2.nxv2i1(<vscale x 1 x i1> %mask, <vscale x 1 x i1> %mask)
+ %interleaved.mask1 = call <vscale x 2 x i1> @llvm.vector.interleave2.nxv2i1(<vscale x 1 x i1> %mask, <vscale x 1 x i1> %mask)
+ %interleaved.mask2 = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %interleaved.mask0, <vscale x 2 x i1> %interleaved.mask1)
+ %interleaved.vec0 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ %interleaved.vec1 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v1, <vscale x 1 x i32> %v1)
+ %interleaved.vec2 = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %interleaved.vec0, <vscale x 2 x i32> %interleaved.vec1)
+ call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %interleaved.vec2, ptr %ptr, <vscale x 4 x i1> %interleaved.mask2, i32 %rvl)
+ ret void
+}
+
+; Negative tests
+
+; We should not transform this function because the deinterleave tree is not in the expected form: the extractvalue indices are duplicated instead of covering each deinterleaved result exactly once.
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @incorrect_extract_value_index(ptr %ptr, i32 %rvl) {
+; RV32-LABEL: incorrect_extract_value_index:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT: vnsrl.wi v12, v8, 0
+; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: vnsrl.wx v9, v12, a0
+; RV32-NEXT: vnsrl.wi v8, v12, 0
+; RV32-NEXT: vmv.v.v v10, v9
+; RV32-NEXT: vmv.v.v v11, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: incorrect_extract_value_index:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: li a0, 32
+; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV64-NEXT: vnsrl.wi v12, v8, 0
+; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: vnsrl.wx v9, v12, a0
+; RV64-NEXT: vnsrl.wi v8, v12, 0
+; RV64-NEXT: vmv.v.v v10, v9
+; RV64-NEXT: vmv.v.v v11, v9
+; RV64-NEXT: ret
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i32 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 %rvl)
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-32.ll b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-32.ll
new file mode 100644
index 00000000000000..3464376723133b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-32.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK %s
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @strided_load_factor2_v2(ptr %ptr, i32 %stride, i32 %rvl) {
+; CHECK-LABEL: strided_load_factor2_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i32(ptr align 4 %ptr, i32 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.cast = bitcast <vscale x 2 x i64> %wide.strided.load to <vscale x 4 x i32>
+ %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.strided.load.cast)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @strided_load_factor4_v2(ptr %ptr, i32 %stride, i32 %rvl) {
+; CHECK-LABEL: strided_load_factor4_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg4e32.v v8, (a0), a1
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 2 x i128> @llvm.experimental.vp.strided.load.nxv2i128.p0.i32(ptr align 4 %ptr, i32 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.cast = bitcast <vscale x 2 x i128> %wide.strided.load to <vscale x 8 x i32>
+
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.strided.load.cast)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @strided_load_factor8_v2(ptr %ptr, i32 %stride, i32 %rvl) {
+; CHECK-LABEL: strided_load_factor8_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg8e32.v v8, (a0), a1
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 2 x i256> @llvm.experimental.vp.strided.load.nxv2i256.p0.i32(ptr align 4 %ptr, i32 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.cast = bitcast <vscale x 2 x i256> %wide.strided.load to <vscale x 16 x i32>
+ %d0 = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.strided.load.cast)
+ %d0.0 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 1
+ %d1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.0)
+ %d1.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 0
+ %d1.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 1
+ %d2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.1)
+ %d2.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 0
+ %d2.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 1
+
+ %d3 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 0
+ %t4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 1
+ %d4 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.1)
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 0
+ %t6 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 1
+ %d5 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.0)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 0
+ %t5 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 1
+ %d6 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.1)
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 0
+ %t7 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ %res4 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3, <vscale x 2 x i32> %t4, 4
+ %res5 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res4, <vscale x 2 x i32> %t5, 5
+ %res6 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res5, <vscale x 2 x i32> %t6, 6
+ %res7 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res6, <vscale x 2 x i32> %t7, 7
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res7
+}
+
+define {<vscale x 2 x ptr>, <vscale x 2 x ptr>} @strided_load_factor2_v2_ptr(ptr %ptr, i32 %stride, i32 %rvl) {
+; CHECK-LABEL: strided_load_factor2_v2_ptr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i32(ptr align 8 %ptr, i32 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.intcast = bitcast <vscale x 2 x i64> %wide.strided.load to <vscale x 4 x i32>
+ %wide.strided.load.cast = inttoptr <vscale x 4 x i32> %wide.strided.load.intcast to <vscale x 4 x ptr>
+ %deinterleaved.results = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %wide.strided.load.cast)
+ %t0 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleaved.results, 1
+ %res0 = insertvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } undef, <vscale x 2 x ptr> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %res0, <vscale x 2 x ptr> %t1, 1
+ ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %res1
+}
+
+; Negative test
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @noalign_info_factor2_v2(ptr %ptr, i32 %stride, i32 %rvl) {
+; CHECK-LABEL: noalign_info_factor2_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-NEXT: vlse64.v v10, (a0), a1
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vnsrl.wx v9, v10, a0
+; CHECK-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i32(ptr %ptr, i32 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.cast = bitcast <vscale x 2 x i64> %wide.strided.load to <vscale x 4 x i32>
+ %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.strided.load.cast)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
+}
+
+declare <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i32(ptr nocapture, i32, <vscale x 4 x i1>, i32)
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @not_vlsseg_factor4_v2(ptr %ptr, i32 %stride, i32 %rvl) {
+; CHECK-LABEL: not_vlsseg_factor4_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-NEXT: vlse64.v v8, (a0), a1
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: vnsrl.wx v12, v8, a0
+; CHECK-NEXT: vnsrl.wi v14, v8, 0
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vnsrl.wx v10, v14, a0
+; CHECK-NEXT: vnsrl.wi v8, v14, 0
+; CHECK-NEXT: vnsrl.wx v11, v12, a0
+; CHECK-NEXT: vnsrl.wi v9, v12, 0
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i32(ptr align 4 %ptr, i32 %stride, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.cast = bitcast <vscale x 4 x i64> %wide.strided.load to <vscale x 8 x i32>
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.strided.load.cast)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-64.ll b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-64.ll
new file mode 100644
index 00000000000000..39d76e61b96b47
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-64.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK %s
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @strided_load_factor2_v2(ptr %ptr, i64 %stride, i32 %rvl) {
+; CHECK-LABEL: strided_load_factor2_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a2, a2, 32
+; CHECK-NEXT: srli a2, a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 4 %ptr, i64 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.cast = bitcast <vscale x 2 x i64> %wide.strided.load to <vscale x 4 x i32>
+ %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.strided.load.cast)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @strided_load_factor4_v2(ptr %ptr, i64 %stride, i32 %rvl) {
+; CHECK-LABEL: strided_load_factor4_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a2, a2, 32
+; CHECK-NEXT: srli a2, a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg4e32.v v8, (a0), a1
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 2 x i128> @llvm.experimental.vp.strided.load.nxv2i128.p0.i64(ptr align 4 %ptr, i64 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.cast = bitcast <vscale x 2 x i128> %wide.strided.load to <vscale x 8 x i32>
+
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.strided.load.cast)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @strided_load_factor8_v2(ptr %ptr, i64 %stride, i32 %rvl) {
+; CHECK-LABEL: strided_load_factor8_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a2, a2, 32
+; CHECK-NEXT: srli a2, a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg8e32.v v8, (a0), a1
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 2 x i256> @llvm.experimental.vp.strided.load.nxv2i256.p0.i64(ptr align 4 %ptr, i64 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.cast = bitcast <vscale x 2 x i256> %wide.strided.load to <vscale x 16 x i32>
+ %d0 = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.strided.load.cast)
+ %d0.0 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 1
+ %d1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.0)
+ %d1.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 0
+ %d1.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 1
+ %d2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.1)
+ %d2.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 0
+ %d2.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 1
+
+ %d3 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 0
+ %t4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 1
+ %d4 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.1)
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 0
+ %t6 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 1
+ %d5 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.0)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 0
+ %t5 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 1
+ %d6 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.1)
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 0
+ %t7 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ %res4 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3, <vscale x 2 x i32> %t4, 4
+ %res5 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res4, <vscale x 2 x i32> %t5, 5
+ %res6 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res5, <vscale x 2 x i32> %t6, 6
+ %res7 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res6, <vscale x 2 x i32> %t7, 7
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res7
+}
+
+define {<vscale x 2 x ptr>, <vscale x 2 x ptr>} @strided_load_factor2_v2_ptr(ptr %ptr, i64 %stride, i32 %rvl) {
+; CHECK-LABEL: strided_load_factor2_v2_ptr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a2, a2, 32
+; CHECK-NEXT: srli a2, a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-NEXT: vlsseg2e64.v v8, (a0), a1
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 2 x i128> @llvm.experimental.vp.strided.load.nxv2i128.p0.i64(ptr align 8 %ptr, i64 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.intcast = bitcast <vscale x 2 x i128> %wide.strided.load to <vscale x 4 x i64>
+ %wide.strided.load.cast = inttoptr <vscale x 4 x i64> %wide.strided.load.intcast to <vscale x 4 x ptr>
+ %deinterleaved.results = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %wide.strided.load.cast)
+ %t0 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleaved.results, 1
+ %res0 = insertvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } undef, <vscale x 2 x ptr> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %res0, <vscale x 2 x ptr> %t1, 1
+ ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %res1
+}
+
+; Negative test
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @noalign_info_factor2_v2(ptr %ptr, i64 %stride, i32 %rvl) {
+; CHECK-LABEL: noalign_info_factor2_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a2, a2, 32
+; CHECK-NEXT: srli a2, a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-NEXT: vlse64.v v10, (a0), a1
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vnsrl.wx v9, v10, a0
+; CHECK-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr %ptr, i64 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.cast = bitcast <vscale x 2 x i64> %wide.strided.load to <vscale x 4 x i32>
+ %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.strided.load.cast)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @not_vlsseg_factor4_v2(ptr %ptr, i64 %stride, i32 %rvl) {
+; CHECK-LABEL: not_vlsseg_factor4_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a2, a2, 32
+; CHECK-NEXT: srli a2, a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-NEXT: vlse64.v v8, (a0), a1
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: vnsrl.wx v12, v8, a0
+; CHECK-NEXT: vnsrl.wi v14, v8, 0
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vnsrl.wx v10, v14, a0
+; CHECK-NEXT: vnsrl.wi v8, v14, 0
+; CHECK-NEXT: vnsrl.wx v11, v12, a0
+; CHECK-NEXT: vnsrl.wi v9, v12, 0
+; CHECK-NEXT: ret
+ %wide.strided.load = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr align 4 %ptr, i64 %stride, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %rvl)
+ %wide.strided.load.cast = bitcast <vscale x 4 x i64> %wide.strided.load to <vscale x 8 x i32>
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.strided.load.cast)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
>From 66d0defbe9625f877b1a5157967b871b4e530f91 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Thu, 19 Dec 2024 10:04:44 -0800
Subject: [PATCH 2/4] Split away the strided segmented load implementation
It will be submitted as a separate follow-up patch.
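For reference, the pattern being split out is the vp.strided.load + deinterleave
sequence that got lowered into a vlssegN strided segmented load. A minimal
sketch (factor = 2), adapted from the doc comment removed below:

  %l = call <vscale x 2 x i16>
       @llvm.experimental.vp.strided.load.nxv2i16.p0.i64(ptr %ptr, i64 %stride,
                                                         <all-true-mask>, i32 %rvl)
  %l.cast = bitcast <vscale x 2 x i16> %l to <vscale x 4 x i8>
  %dl = call { <vscale x 2 x i8>, <vscale x 2 x i8> }
        @llvm.vector.deinterleave2.nxv4i8(<vscale x 4 x i8> %l.cast)

is turned into a single @llvm.riscv.vlsseg2 call whose extracted results replace
the extractvalues of %dl.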
---
llvm/include/llvm/CodeGen/TargetLowering.h | 13 --
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 58 +-----
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 141 ---------------
llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 -
...able-vectors-strided-interleave-load-32.ll | 161 -----------------
...able-vectors-strided-interleave-load-64.ll | 171 ------------------
6 files changed, 6 insertions(+), 542 deletions(-)
delete mode 100644 llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-32.ll
delete mode 100644 llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-64.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 823f8aa8c9a7ef..9a4b39ac6ca069 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3181,19 +3181,6 @@ class TargetLoweringBase {
return false;
}
- /// Lower a deinterleave intrinsic to a target specific strided load
- /// intrinsic. Return true on success.
- ///
- /// \p StridedLoad is the vp.strided.load instruction.
- /// \p DI is the deinterleave intrinsic.
- /// \p Factor is the interleave factor.
- /// \p DeinterleaveRes is a list of deinterleaved results.
- virtual bool lowerDeinterleaveIntrinsicToStridedLoad(
- VPIntrinsic *StridedLoad, IntrinsicInst *DI, unsigned Factor,
- ArrayRef<Value *> DeinterleaveRes) const {
- return false;
- }
-
/// Lower a deinterleave intrinsic to a target specific load intrinsic.
/// Return true on success. Currently only supports
/// llvm.vector.deinterleave2
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 0f3b65b8d9af2f..dc881d0cdd1f41 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -661,12 +661,13 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
IntrinsicInst *DI, SmallVectorImpl<Instruction *> &DeadInsts) {
- using namespace PatternMatch;
- SmallVector<Value *, 8> DeInterleaveResults;
- unsigned Factor = getVectorDeInterleaveFactor(DI, DeInterleaveResults);
+ if (auto *VPLoad = dyn_cast<VPIntrinsic>(DI->getOperand(0))) {
+ SmallVector<Value *, 8> DeInterleaveResults;
+ unsigned Factor = getVectorDeInterleaveFactor(DI, DeInterleaveResults);
+ if (!Factor)
+ return false;
- if (auto *VPLoad = dyn_cast<VPIntrinsic>(DI->getOperand(0));
- Factor && VPLoad) {
+ using namespace PatternMatch;
if (!match(VPLoad, m_OneUse(m_Intrinsic<Intrinsic::vp_load>())))
return false;
@@ -689,53 +690,6 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
return true;
}
- // Match
- // %x = vp.strided.load ;; VPStridedLoad
- // %y = bitcast %x ;; BitCast
- // %y' = inttoptr %y
- // %z = deinterleave %y ;; DI
- if (Factor && isa<BitCastInst, IntToPtrInst>(DI->getOperand(0))) {
- auto *BitCast = cast<Instruction>(DI->getOperand(0));
- if (!BitCast->hasOneUse())
- return false;
-
- Instruction *IntToPtrCast = nullptr;
- if (auto *BC = dyn_cast<BitCastInst>(BitCast->getOperand(0))) {
- IntToPtrCast = BitCast;
- BitCast = BC;
- }
-
- // Match the type is
- // <VF x (factor * elementTy)> bitcast to <(VF * factor) x elementTy>
- Value *BitCastSrc = BitCast->getOperand(0);
- auto *BitCastSrcTy = dyn_cast<VectorType>(BitCastSrc->getType());
- auto *BitCastDstTy = cast<VectorType>(BitCast->getType());
- if (!BitCastSrcTy || (BitCastSrcTy->getElementCount() * Factor !=
- BitCastDstTy->getElementCount()))
- return false;
-
- if (auto *VPStridedLoad = dyn_cast<VPIntrinsic>(BitCast->getOperand(0))) {
- if (VPStridedLoad->getIntrinsicID() !=
- Intrinsic::experimental_vp_strided_load ||
- !VPStridedLoad->hasOneUse())
- return false;
-
- LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI
- << "\n");
-
- if (!TLI->lowerDeinterleaveIntrinsicToStridedLoad(
- VPStridedLoad, DI, Factor, DeInterleaveResults))
- return false;
-
- DeadInsts.push_back(DI);
- if (IntToPtrCast)
- DeadInsts.push_back(IntToPtrCast);
- DeadInsts.push_back(BitCast);
- DeadInsts.push_back(VPStridedLoad);
- return true;
- }
- }
-
LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
if (!LI || !LI->hasOneUse() || !LI->isSimple())
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 2dafbf737512a9..cbc5a60004dcbc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -22459,147 +22459,6 @@ bool RISCVTargetLowering::lowerInterleavedScalableStore(
return true;
}
-/// Lower an interleaved vp.strided.load into a vlssegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.strided.load (Factor = 2):
-/// %l = call <vscale x 2 x i16>
-/// @llvm.experimental.vp.strided.load.nxv2i16.p0.i64(ptr %ptr,
-/// %stride,
-/// <all-true-mask>,
-/// i32 %rvl)
-/// %l.cast = bitcast <vscale x 2 x i16> %l to <vscale x 4 x i8>
-/// %dl = tail call { <vscale x 2 x i8>, <vscale x 2 x i8> }
-/// @llvm.vector.deinterleave2.nxv2i8(
-/// <vscale x 4 x i8> %l.cast)
-/// %r0 = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i8> } %dl, 0
-/// %r1 = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i8> } %dl, 1
-///
-/// Into:
-/// %ssl = call { <vscale x 2 x i8>, <vscale x 2 x i8> }
-/// @llvm.riscv.vlseg2.nxv2i8.i64(<vscale x 32 x i8> poison,
-/// <vscale x 32 x i8> poison,
-/// %ptr,
-/// %stride,
-/// i64 %rvl)
-/// %r0 = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i8> } %ssl, 0
-/// %r1 = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i8> } %ssl, 1
-///
-/// NOTE: the deinterleave2 intrinsic and the bitcast instruction won't be
-/// touched and is expected to be removed by the caller
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToStridedLoad(
- VPIntrinsic *StridedLoad, IntrinsicInst *DI, unsigned Factor,
- ArrayRef<Value *> DeInterleaveResults) const {
- using namespace llvm::PatternMatch;
- Value *BasePtr, *Stride, *Mask, *EVL;
- if (!match(StridedLoad, m_Intrinsic<Intrinsic::experimental_vp_strided_load>(
- m_Value(BasePtr), m_Value(Stride), m_Value(Mask),
- m_Value(EVL))))
- return false;
-
- [[maybe_unused]] auto *DISrcTy =
- cast<VectorType>(DI->getOperand(0)->getType());
- [[maybe_unused]] auto *LTy = cast<VectorType>(StridedLoad->getType());
- auto &DL = StridedLoad->getModule()->getDataLayout();
- assert(DL.getTypeAllocSizeInBits(DISrcTy) == DL.getTypeAllocSizeInBits(LTy) &&
- "The primitive size of strided load and the source of deinterleave "
- "should be the same.");
- assert(DISrcTy->getElementCount() == LTy->getElementCount() * Factor &&
- "ElementCount of source deinterleave should be equal to the "
- "ElementCount of strided load multiplied by factor.");
-
- auto *ResTy = cast<VectorType>(DeInterleaveResults[0]->getType());
-
- Align Alignment =
- cast<VPIntrinsic>(StridedLoad)->getPointerAlignment().valueOrOne();
- if (!isLegalInterleavedAccessType(
- ResTy, Factor, Alignment,
- BasePtr->getType()->getPointerAddressSpace(), DL))
- return false;
-
- IRBuilder<> Builder(StridedLoad);
- auto *XLenTy =
- Type::getIntNTy(StridedLoad->getContext(), Subtarget.getXLen());
- assert(Stride->getType() == XLenTy &&
- "The type of stride must be the XLEN integer type.");
- EVL = Builder.CreateZExtOrTrunc(EVL, XLenTy);
-
- static const Intrinsic::ID IntrMaskIds[] = {
- Intrinsic::riscv_vlsseg2_mask, Intrinsic::riscv_vlsseg3_mask,
- Intrinsic::riscv_vlsseg4_mask, Intrinsic::riscv_vlsseg5_mask,
- Intrinsic::riscv_vlsseg6_mask, Intrinsic::riscv_vlsseg7_mask,
- Intrinsic::riscv_vlsseg8_mask,
- };
-
- static const Intrinsic::ID IntrIds[] = {
- Intrinsic::riscv_vlsseg2, Intrinsic::riscv_vlsseg3,
- Intrinsic::riscv_vlsseg4, Intrinsic::riscv_vlsseg5,
- Intrinsic::riscv_vlsseg6, Intrinsic::riscv_vlsseg7,
- Intrinsic::riscv_vlsseg8,
- };
-
- unsigned SEW = DL.getTypeSizeInBits(ResTy->getElementType());
- unsigned NumElts = ResTy->getElementCount().getKnownMinValue();
- Type *VecTupTy = TargetExtType::get(
- StridedLoad->getContext(), "riscv.vector.tuple",
- ScalableVectorType::get(Type::getInt8Ty(StridedLoad->getContext()),
- NumElts * SEW / 8),
- Factor);
-
- Value *PoisonVal = PoisonValue::get(VecTupTy);
- SmallVector<Value *, 7> Operands;
- Operands.append({PoisonVal, BasePtr, Stride});
-
- Intrinsic::ID VlssegNID = IntrIds[Factor - 2];
- bool IsMasked = !match(Mask, m_AllOnes());
- if (IsMasked) {
- VlssegNID = IntrMaskIds[Factor - 2];
- Operands.push_back(Mask);
- }
-
- Operands.push_back(EVL);
-
- // Set the tail policy to tail-agnostic, mask-agnostic (tama) for masked
- // intrinsics
- if (IsMasked)
- Operands.push_back(ConstantInt::get(XLenTy, 3));
-
- Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW)));
-
- Function *VlssegNFunc;
- if (IsMasked) {
- VlssegNFunc = Intrinsic::getOrInsertDeclaration(
- StridedLoad->getModule(), VlssegNID,
- {VecTupTy, EVL->getType(), Mask->getType()});
- } else {
- VlssegNFunc = Intrinsic::getOrInsertDeclaration(
- StridedLoad->getModule(), VlssegNID, {VecTupTy, EVL->getType()});
- }
- CallInst *VlssegN = Builder.CreateCall(VlssegNFunc, Operands);
-
- SmallVector<Type *, 8> AggrTypes{Factor, ResTy};
- Value *Return =
- PoisonValue::get(StructType::get(StridedLoad->getContext(), AggrTypes));
- Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
- StridedLoad->getModule(), Intrinsic::riscv_tuple_extract,
- {ResTy, VecTupTy});
- for (unsigned i = 0; i < Factor; ++i) {
- Value *VecExtract =
- Builder.CreateCall(VecExtractFunc, {VlssegN, Builder.getInt32(i)});
- Return = Builder.CreateInsertValue(Return, VecExtract, i);
- }
-
- for (auto [Idx, DIO] : enumerate(DeInterleaveResults)) {
- // We have to create a brand new ExtractValue to replace each
- // of these old ExtractValue instructions.
- Value *NewEV =
- Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
- DIO->replaceAllUsesWith(NewEV);
- }
- DI->replaceAllUsesWith(UndefValue::get(DI->getType()));
-
- return true;
-}
-
MachineInstr *
RISCVTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
MachineBasicBlock::instr_iterator &MBBI,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index b11def055ba7fa..19ed88ca943a14 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -917,10 +917,6 @@ class RISCVTargetLowering : public TargetLowering {
VPIntrinsic *Store, Value *Mask, IntrinsicInst *InterleaveIntrin,
unsigned Factor, ArrayRef<Value *> InterleaveOps) const override;
- bool lowerDeinterleaveIntrinsicToStridedLoad(
- VPIntrinsic *StridedLoad, IntrinsicInst *DI, unsigned Factor,
- ArrayRef<Value *> DeinterleaveRes) const override;
-
bool supportKCFIBundles() const override { return true; }
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-32.ll b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-32.ll
deleted file mode 100644
index 3464376723133b..00000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-32.ll
+++ /dev/null
@@ -1,161 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK %s
-
-define {<vscale x 2 x i32>, <vscale x 2 x i32>} @strided_load_factor2_v2(ptr %ptr, i32 %stride, i32 %rvl) {
-; CHECK-LABEL: strided_load_factor2_v2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i32(ptr align 4 %ptr, i32 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.cast = bitcast <vscale x 2 x i64> %wide.strided.load to <vscale x 4 x i32>
- %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.strided.load.cast)
- %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
- %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
- %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
- %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
- ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
-}
-
-define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @strided_load_factor4_v2(ptr %ptr, i32 %stride, i32 %rvl) {
-; CHECK-LABEL: strided_load_factor4_v2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vlsseg4e32.v v8, (a0), a1
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 2 x i128> @llvm.experimental.vp.strided.load.nxv2i128.p0.i32(ptr align 4 %ptr, i32 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.cast = bitcast <vscale x 2 x i128> %wide.strided.load to <vscale x 8 x i32>
-
- %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.strided.load.cast)
- %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
- %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
- %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
- %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
- %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
- %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
- %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
- %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
-
- %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
- %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
- %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
- %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
- ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
-}
-
-define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @strided_load_factor8_v2(ptr %ptr, i32 %stride, i32 %rvl) {
-; CHECK-LABEL: strided_load_factor8_v2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vlsseg8e32.v v8, (a0), a1
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 2 x i256> @llvm.experimental.vp.strided.load.nxv2i256.p0.i32(ptr align 4 %ptr, i32 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.cast = bitcast <vscale x 2 x i256> %wide.strided.load to <vscale x 16 x i32>
- %d0 = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.strided.load.cast)
- %d0.0 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 0
- %d0.1 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 1
- %d1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.0)
- %d1.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 0
- %d1.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 1
- %d2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.1)
- %d2.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 0
- %d2.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 1
-
- %d3 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.0)
- %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 0
- %t4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 1
- %d4 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.1)
- %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 0
- %t6 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 1
- %d5 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.0)
- %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 0
- %t5 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 1
- %d6 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.1)
- %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 0
- %t7 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 1
-
- %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
- %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
- %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
- %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
- %res4 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3, <vscale x 2 x i32> %t4, 4
- %res5 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res4, <vscale x 2 x i32> %t5, 5
- %res6 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res5, <vscale x 2 x i32> %t6, 6
- %res7 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res6, <vscale x 2 x i32> %t7, 7
- ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res7
-}
-
-define {<vscale x 2 x ptr>, <vscale x 2 x ptr>} @strided_load_factor2_v2_ptr(ptr %ptr, i32 %stride, i32 %rvl) {
-; CHECK-LABEL: strided_load_factor2_v2_ptr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i32(ptr align 8 %ptr, i32 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.intcast = bitcast <vscale x 2 x i64> %wide.strided.load to <vscale x 4 x i32>
- %wide.strided.load.cast = inttoptr <vscale x 4 x i32> %wide.strided.load.intcast to <vscale x 4 x ptr>
- %deinterleaved.results = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %wide.strided.load.cast)
- %t0 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleaved.results, 0
- %t1 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleaved.results, 1
- %res0 = insertvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } undef, <vscale x 2 x ptr> %t0, 0
- %res1 = insertvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %res0, <vscale x 2 x ptr> %t1, 1
- ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %res1
-}
-
-; Negative test
-
-define {<vscale x 2 x i32>, <vscale x 2 x i32>} @noalign_info_factor2_v2(ptr %ptr, i32 %stride, i32 %rvl) {
-; CHECK-LABEL: noalign_info_factor2_v2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT: vlse64.v v10, (a0), a1
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vnsrl.wx v9, v10, a0
-; CHECK-NEXT: vnsrl.wi v8, v10, 0
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i32(ptr %ptr, i32 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.cast = bitcast <vscale x 2 x i64> %wide.strided.load to <vscale x 4 x i32>
- %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.strided.load.cast)
- %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
- %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
- %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
- %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
- ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
-}
-
-declare <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i32(ptr nocapture, i32, <vscale x 4 x i1>, i32)
-
-define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @not_vlsseg_factor4_v2(ptr %ptr, i32 %stride, i32 %rvl) {
-; CHECK-LABEL: not_vlsseg_factor4_v2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT: vlse64.v v8, (a0), a1
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT: vnsrl.wx v12, v8, a0
-; CHECK-NEXT: vnsrl.wi v14, v8, 0
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vnsrl.wx v10, v14, a0
-; CHECK-NEXT: vnsrl.wi v8, v14, 0
-; CHECK-NEXT: vnsrl.wx v11, v12, a0
-; CHECK-NEXT: vnsrl.wi v9, v12, 0
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i32(ptr align 4 %ptr, i32 %stride, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.cast = bitcast <vscale x 4 x i64> %wide.strided.load to <vscale x 8 x i32>
- %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.strided.load.cast)
- %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
- %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
- %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
- %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
- %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
- %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
- %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
- %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
-
- %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
- %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
- %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
- %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
- ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-64.ll b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-64.ll
deleted file mode 100644
index 39d76e61b96b47..00000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-strided-interleave-load-64.ll
+++ /dev/null
@@ -1,171 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK %s
-
-define {<vscale x 2 x i32>, <vscale x 2 x i32>} @strided_load_factor2_v2(ptr %ptr, i64 %stride, i32 %rvl) {
-; CHECK-LABEL: strided_load_factor2_v2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: slli a2, a2, 32
-; CHECK-NEXT: srli a2, a2, 32
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr align 4 %ptr, i64 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.cast = bitcast <vscale x 2 x i64> %wide.strided.load to <vscale x 4 x i32>
- %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.strided.load.cast)
- %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
- %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
- %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
- %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
- ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
-}
-
-define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @strided_load_factor4_v2(ptr %ptr, i64 %stride, i32 %rvl) {
-; CHECK-LABEL: strided_load_factor4_v2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: slli a2, a2, 32
-; CHECK-NEXT: srli a2, a2, 32
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vlsseg4e32.v v8, (a0), a1
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 2 x i128> @llvm.experimental.vp.strided.load.nxv2i128.p0.i64(ptr align 4 %ptr, i64 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.cast = bitcast <vscale x 2 x i128> %wide.strided.load to <vscale x 8 x i32>
-
- %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.strided.load.cast)
- %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
- %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
- %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
- %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
- %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
- %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
- %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
- %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
-
- %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
- %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
- %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
- %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
- ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
-}
-
-define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @strided_load_factor8_v2(ptr %ptr, i64 %stride, i32 %rvl) {
-; CHECK-LABEL: strided_load_factor8_v2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: slli a2, a2, 32
-; CHECK-NEXT: srli a2, a2, 32
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vlsseg8e32.v v8, (a0), a1
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 2 x i256> @llvm.experimental.vp.strided.load.nxv2i256.p0.i64(ptr align 4 %ptr, i64 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.cast = bitcast <vscale x 2 x i256> %wide.strided.load to <vscale x 16 x i32>
- %d0 = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.strided.load.cast)
- %d0.0 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 0
- %d0.1 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 1
- %d1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.0)
- %d1.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 0
- %d1.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 1
- %d2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.1)
- %d2.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 0
- %d2.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 1
-
- %d3 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.0)
- %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 0
- %t4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 1
- %d4 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.1)
- %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 0
- %t6 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 1
- %d5 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.0)
- %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 0
- %t5 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 1
- %d6 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.1)
- %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 0
- %t7 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 1
-
- %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
- %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
- %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
- %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
- %res4 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3, <vscale x 2 x i32> %t4, 4
- %res5 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res4, <vscale x 2 x i32> %t5, 5
- %res6 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res5, <vscale x 2 x i32> %t6, 6
- %res7 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res6, <vscale x 2 x i32> %t7, 7
- ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res7
-}
-
-define {<vscale x 2 x ptr>, <vscale x 2 x ptr>} @strided_load_factor2_v2_ptr(ptr %ptr, i64 %stride, i32 %rvl) {
-; CHECK-LABEL: strided_load_factor2_v2_ptr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: slli a2, a2, 32
-; CHECK-NEXT: srli a2, a2, 32
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT: vlsseg2e64.v v8, (a0), a1
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 2 x i128> @llvm.experimental.vp.strided.load.nxv2i128.p0.i64(ptr align 8 %ptr, i64 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.intcast = bitcast <vscale x 2 x i128> %wide.strided.load to <vscale x 4 x i64>
- %wide.strided.load.cast = inttoptr <vscale x 4 x i64> %wide.strided.load.intcast to <vscale x 4 x ptr>
- %deinterleaved.results = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %wide.strided.load.cast)
- %t0 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleaved.results, 0
- %t1 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleaved.results, 1
- %res0 = insertvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } undef, <vscale x 2 x ptr> %t0, 0
- %res1 = insertvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %res0, <vscale x 2 x ptr> %t1, 1
- ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %res1
-}
-
-; Negative test
-
-define {<vscale x 2 x i32>, <vscale x 2 x i32>} @noalign_info_factor2_v2(ptr %ptr, i64 %stride, i32 %rvl) {
-; CHECK-LABEL: noalign_info_factor2_v2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: slli a2, a2, 32
-; CHECK-NEXT: srli a2, a2, 32
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT: vlse64.v v10, (a0), a1
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vnsrl.wx v9, v10, a0
-; CHECK-NEXT: vnsrl.wi v8, v10, 0
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr %ptr, i64 %stride, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.cast = bitcast <vscale x 2 x i64> %wide.strided.load to <vscale x 4 x i32>
- %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.strided.load.cast)
- %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
- %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
- %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
- %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
- ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
-}
-
-define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @not_vlsseg_factor4_v2(ptr %ptr, i64 %stride, i32 %rvl) {
-; CHECK-LABEL: not_vlsseg_factor4_v2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: slli a2, a2, 32
-; CHECK-NEXT: srli a2, a2, 32
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT: vlse64.v v8, (a0), a1
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT: vnsrl.wx v12, v8, a0
-; CHECK-NEXT: vnsrl.wi v14, v8, 0
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vnsrl.wx v10, v14, a0
-; CHECK-NEXT: vnsrl.wi v8, v14, 0
-; CHECK-NEXT: vnsrl.wx v11, v12, a0
-; CHECK-NEXT: vnsrl.wi v9, v12, 0
-; CHECK-NEXT: ret
- %wide.strided.load = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr align 4 %ptr, i64 %stride, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %rvl)
- %wide.strided.load.cast = bitcast <vscale x 4 x i64> %wide.strided.load to <vscale x 8 x i32>
- %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.strided.load.cast)
- %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
- %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
- %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
- %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
- %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
- %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
- %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
- %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
-
- %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } undef, <vscale x 2 x i32> %t0, 0
- %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
- %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
- %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
- ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
-}
>From c15c2fcdc626e111f7b5135000678aa99cf19baf Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Thu, 19 Dec 2024 10:06:52 -0800
Subject: [PATCH 3/4] Address review comments
---
.../CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll
index 4fcfefcdfcaa07..a51382b6c31d27 100644
--- a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll
@@ -125,7 +125,7 @@ define void @store_factor2_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, pt
ret void
}
-; Expecting uni-strde store here rather than segmented store.
+; Expecting unit-stride store here rather than segmented store.
define void @store_factor2_const_splat(ptr %dst) {
; RV32-LABEL: store_factor2_const_splat:
; RV32: # %bb.0:
>From e4b64e8a6ffe2dd21f035c269df8b0b56e2f5905 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Tue, 24 Dec 2024 10:38:31 -0800
Subject: [PATCH 4/4] Address review comments
Also check whether the tree is balanced.
---
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 42 ++++---
.../scalable-vectors-interleaved-access.ll | 107 ++++++++++++++++++
2 files changed, 133 insertions(+), 16 deletions(-)
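As a side note for reviewers: the comment fixed in the first hunk below describes how the leaves of a (de)interleave tree are collected. For a perfectly balanced power-of-two tree, the collected leaf order (e.g. ACBD for factor 4) maps onto the order the TLI hooks expect (ABCD) by bit-reversing each leaf's position. The standalone C++ sketch here only illustrates that reordering under that assumption; it is not the pass's actual interleaveLeafValues implementation, and the helper names are hypothetical.

// Illustrative sketch only (not LLVM code): reorder leaves collected from a
// perfectly balanced power-of-two (de)interleave tree (e.g. A C B D) into the
// order the TLI hooks expect (A B C D) by bit-reversing each leaf position.
#include <cassert>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

static unsigned reverseBits(unsigned V, unsigned NumBits) {
  unsigned R = 0;
  for (unsigned I = 0; I < NumBits; ++I)
    if (V & (1u << I))
      R |= 1u << (NumBits - 1 - I);
  return R;
}

// Hypothetical helper; the in-tree interleaveLeafValues may differ.
static void reorderLeaves(std::vector<std::string> &Leaves) {
  unsigned Factor = Leaves.size();
  assert(Factor > 1 && (Factor & (Factor - 1)) == 0 && "power of two only");
  unsigned Log2 = 0;
  while ((1u << Log2) < Factor)
    ++Log2;
  std::vector<std::string> Out(Factor);
  for (unsigned I = 0; I < Factor; ++I)
    Out[reverseBits(I, Log2)] = Leaves[I];
  Leaves = std::move(Out);
}

int main() {
  std::vector<std::string> Leaves = {"A", "C", "B", "D"}; // collected order
  reorderLeaves(Leaves);                                  // prints: A B C D
  for (const auto &L : Leaves)
    std::cout << L << ' ';
  std::cout << '\n';
}

The same bit-reversal also yields ABCDEFGH from the factor-8 collected order AECGBFDH, which is why a single reordering step suffices for any balanced power-of-two tree.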
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index dc881d0cdd1f41..bc87278531cbca 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -257,7 +257,7 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
// |
// A B C D
//
-// We will get ABCD at the end while the leave operands/results
+// We will get ABCD at the end while the leaf operands/results
// are ACBD, which are also what we initially collected in
// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI
// hooks (e.g. lowerInterleavedScalableLoad) expect ABCD, so we need
@@ -311,6 +311,11 @@ static unsigned getVectorInterleaveFactor(IntrinsicInst *II,
continue;
}
+ // If this is not a perfectly balanced tree, the leaf
+ // result types would be different.
+ if (!Operands.empty() && Op->getType() != Operands.back()->getType())
+ return 0;
+
++Factor;
Operands.push_back(Op);
}
@@ -318,11 +323,11 @@ static unsigned getVectorInterleaveFactor(IntrinsicInst *II,
// Currently we only recognize power-of-two factors.
// FIXME: should we assert here instead?
- if (Factor > 1 && isPowerOf2_32(Factor)) {
- interleaveLeafValues(Operands);
- return Factor;
- }
- return 0;
+ if (Factor <= 1 || !isPowerOf2_32(Factor))
+ return 0;
+
+ interleaveLeafValues(Operands);
+ return Factor;
}
/// Check the interleaved mask
@@ -367,7 +372,7 @@ static unsigned getVectorDeInterleaveFactor(IntrinsicInst *II,
unsigned VisitedIdx = 0;
for (User *Usr : Current->users()) {
- // We're playing safe here and matches only the expression
+ // We're playing safe here and matching only the expression
// consisting of a perfectly balanced binary tree in which all
// intermediate values are only used once.
if (!Usr->hasOneUse() || !isa<ExtractValueInst>(Usr))
@@ -379,10 +384,10 @@ static unsigned getVectorDeInterleaveFactor(IntrinsicInst *II,
return 0;
// The idea is that we don't want to have two extractvalue
- // on the same index. So we XOR (index + 1) onto VisitedIdx
+ // on the same index. So we XOR (1 << index) onto VisitedIdx
// such that if there is any duplication, VisitedIdx will be
// zero.
- VisitedIdx ^= Indices[0] + 1;
+ VisitedIdx ^= (1 << Indices[0]);
if (!VisitedIdx)
return 0;
// We have a legal index. At this point we're either going
@@ -403,15 +408,20 @@ static unsigned getVectorDeInterleaveFactor(IntrinsicInst *II,
m_Intrinsic<Intrinsic::vector_deinterleave2>()) &&
EV->user_back()->hasNUses(2)) {
auto *EVUsr = cast<IntrinsicInst>(EV->user_back());
- if (SwapWithLast)
+ if (SwapWithLast && !Queue.empty())
Queue.insert(Queue.end() - 1, EVUsr);
else
Queue.push_back(EVUsr);
continue;
}
+ // If this is not a perfectly balanced tree, the leaf
+ // result types would be different.
+ if (!Results.empty() && EV->getType() != Results.back()->getType())
+ return 0;
+
// Save the leaf value.
- if (SwapWithLast)
+ if (SwapWithLast && !Results.empty())
Results.insert(Results.end() - 1, EV);
else
Results.push_back(EV);
@@ -422,11 +432,11 @@ static unsigned getVectorDeInterleaveFactor(IntrinsicInst *II,
// Currently we only recognize power-of-two factors.
// FIXME: should we assert here instead?
- if (Factor > 1 && isPowerOf2_32(Factor)) {
- interleaveLeafValues(Results);
- return Factor;
- }
- return 0;
+ if (Factor <= 1 || !isPowerOf2_32(Factor))
+ return 0;
+
+ interleaveLeafValues(Results);
+ return Factor;
}
bool InterleavedAccessImpl::lowerInterleavedLoad(
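The VisitedIdx bookkeeping in the hunk above rejects two extractvalue users that read the same struct index: each user XORs (1 << index) into a running mask, so a duplicated index toggles its bit back off and, with exactly two users reading index 0 or 1 of a deinterleave2 result, leaves the mask at zero. The following standalone sketch shows that check in isolation; the function name and the "two users, indices 0 or 1" simplification are assumptions made for illustration, not the pass's exact logic.

// Illustrative sketch only: detect a duplicated extractvalue index among the
// users of a deinterleave2 result via the XOR'd bit-mask trick in the patch.
#include <cstdint>
#include <iostream>
#include <vector>

// Returns false if any index repeats (both users would read the same half of
// the deinterleave2 result). Assumes at most two users with indices 0 or 1.
static bool hasDistinctExtractIndices(const std::vector<unsigned> &ExtractIndices) {
  uint32_t VisitedIdx = 0;
  for (unsigned Idx : ExtractIndices) {
    VisitedIdx ^= (1u << Idx); // a repeated index clears its bit again
    if (!VisitedIdx)
      return false;            // mask went back to zero: duplicate detected
  }
  return true;
}

int main() {
  std::cout << hasDistinctExtractIndices({0, 1}) << '\n'; // 1: accepted
  std::cout << hasDistinctExtractIndices({1, 1}) << '\n'; // 0: rejected
}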
diff --git a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll
index a51382b6c31d27..ac254792e167a8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll
@@ -507,6 +507,113 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2
ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
}
+; We should not transform this function because the expression is not a balanced tree.
+define {<vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>} @not_balanced_load_tree(ptr %ptr, i32 %rvl) {
+; RV32-LABEL: not_balanced_load_tree:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v12, (a0)
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT: vnsrl.wx v8, v12, a0
+; RV32-NEXT: vnsrl.wi v16, v12, 0
+; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: vnsrl.wi v10, v16, 0
+; RV32-NEXT: vnsrl.wx v11, v16, a0
+; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; RV32-NEXT: vnsrl.wx v12, v11, a0
+; RV32-NEXT: vnsrl.wi v11, v11, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: not_balanced_load_tree:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v12, (a0)
+; RV64-NEXT: li a0, 32
+; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV64-NEXT: vnsrl.wx v8, v12, a0
+; RV64-NEXT: vnsrl.wi v16, v12, 0
+; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: vnsrl.wi v10, v16, 0
+; RV64-NEXT: vnsrl.wx v11, v16, a0
+; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; RV64-NEXT: vnsrl.wx v12, v11, a0
+; RV64-NEXT: vnsrl.wi v11, v11, 0
+; RV64-NEXT: ret
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i32 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 %rvl)
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %t0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %d1.1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> %d1.1)
+ %t2 = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } %d2, 0
+ %t3 = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } undef, <vscale x 4 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } %res1, <vscale x 1 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } %res2, <vscale x 1 x i32> %t3, 3
+ ret { <vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } %res3
+}
+
+define void @not_balanced_store_tree(<vscale x 1 x i32> %v0, <vscale x 2 x i32> %v1, <vscale x 4 x i32> %v2, ptr %ptr, i32 %rvl) {
+; RV32-LABEL: not_balanced_store_tree:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; RV32-NEXT: vwaddu.vv v12, v8, v8
+; RV32-NEXT: li a2, -1
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: vwmaccu.vx v12, a2, v8
+; RV32-NEXT: srli a3, a3, 3
+; RV32-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vx v8, v12, a3
+; RV32-NEXT: add a4, a3, a3
+; RV32-NEXT: vsetvli zero, a4, e32, m1, ta, ma
+; RV32-NEXT: vslideup.vx v12, v8, a3
+; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV32-NEXT: vwaddu.vv v14, v12, v9
+; RV32-NEXT: vwmaccu.vx v14, a2, v9
+; RV32-NEXT: vsetvli a3, zero, e32, m2, ta, ma
+; RV32-NEXT: vwaddu.vv v16, v14, v10
+; RV32-NEXT: vwmaccu.vx v16, a2, v10
+; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v16, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: not_balanced_store_tree:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; RV64-NEXT: vwaddu.vv v12, v8, v8
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: vwmaccu.vx v12, a2, v8
+; RV64-NEXT: srli a3, a3, 3
+; RV64-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vx v8, v12, a3
+; RV64-NEXT: add a4, a3, a3
+; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma
+; RV64-NEXT: vslideup.vx v12, v8, a3
+; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV64-NEXT: vwaddu.vv v14, v12, v9
+; RV64-NEXT: vwmaccu.vx v14, a2, v9
+; RV64-NEXT: vsetvli a3, zero, e32, m2, ta, ma
+; RV64-NEXT: vwaddu.vv v16, v14, v10
+; RV64-NEXT: vwmaccu.vx v16, a2, v10
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v16, (a0)
+; RV64-NEXT: ret
+ %interleaved.vec0 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ %interleaved.vec1 = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %interleaved.vec0, <vscale x 2 x i32> %v1)
+ %interleaved.vec2 = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %interleaved.vec1, <vscale x 4 x i32> %v2)
+ call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> %interleaved.vec2, ptr %ptr, <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i32 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 %rvl)
+ ret void
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}