[llvm] 005b23b - [IA][RISCV] Support VP loads/stores in InterleavedAccessPass (#120490)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 4 11:07:41 PST 2025
Author: Min-Yih Hsu
Date: 2025-02-04T11:07:34-08:00
New Revision: 005b23bb3bf0b943db3a6d12b01b2c01789341b8
URL: https://github.com/llvm/llvm-project/commit/005b23bb3bf0b943db3a6d12b01b2c01789341b8
DIFF: https://github.com/llvm/llvm-project/commit/005b23bb3bf0b943db3a6d12b01b2c01789341b8.diff
LOG: [IA][RISCV] Support VP loads/stores in InterleavedAccessPass (#120490)
Teach InterleavedAccessPass to recognize the following patterns:
- Storing an interleaved scalable vector via vp.store
- Deinterleaving a scalable vector loaded from vp.load
Upon recognizing these patterns, IA will collect the interleaved /
deinterleaved operands and delegate them to their respective
newly-added TLI hooks.
For RISC-V, these patterns are lowered into segmented loads/stores.
Right now we only recognize power-of-two (de)interleave cases, in which
(de)interleave4/8 are synthesized from a tree of (de)interleave2.
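
As a rough, hand-written sketch (factor 2; %ptr, %a, %b and %evl are
placeholder names, not taken verbatim from the patch), the two recognized
shapes look like:

  ; deinterleaved vp.load
  %rvl  = mul i32 %evl, 2
  %wide = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %rvl)
  %dei  = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide)

  ; interleaved vp.store
  %vec  = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b)
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %vec, ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %rvl)

On RISC-V these become vlseg2e32 / vsseg2e32 with the EVL divided by the
factor, as the new test file below shows.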
---------
Co-authored-by: Nikolay Panchenko <nicholas.panchenko at gmail.com>
Added:
llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/InterleavedAccessPass.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.h
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 04ee24c0916e5f5..bbecc7a6ddaee79 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -94,6 +94,7 @@ class TargetRegisterClass;
class TargetRegisterInfo;
class TargetTransformInfo;
class Value;
+class VPIntrinsic;
namespace Sched {
@@ -3156,6 +3157,30 @@ class TargetLoweringBase {
return false;
}
+ /// Lower an interleaved load to target specific intrinsics. Return
+ /// true on success.
+ ///
+ /// \p Load is a vp.load instruction.
+ /// \p Mask is a mask value
+ /// \p DeinterleaveRes is a list of deinterleaved results.
+ virtual bool
+ lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask,
+ ArrayRef<Value *> DeinterleaveRes) const {
+ return false;
+ }
+
+ /// Lower an interleaved store to target specific intrinsics. Return
+ /// true on success.
+ ///
+ /// \p Store is the vp.store instruction.
+ /// \p Mask is a mask value
+ /// \p InterleaveOps is a list of values being interleaved.
+ virtual bool
+ lowerInterleavedIntrinsicToVPStore(VPIntrinsic *Store, Value *Mask,
+ ArrayRef<Value *> InterleaveOps) const {
+ return false;
+ }
+
/// Lower a deinterleave intrinsic to a target specific load intrinsic.
/// Return true on success. Currently only supports
/// llvm.vector.deinterleave2
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 3f6a69ecb7d729a..3261f2858b2368c 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -630,11 +630,37 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
return true;
}
+// Return the corresponding deinterleaved mask, or nullptr if there is no valid
+// mask.
+static Value *getMask(Value *WideMask, unsigned Factor,
+ VectorType *LeafValueTy) {
+ using namespace llvm::PatternMatch;
+ if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
+ SmallVector<Value *, 8> Operands;
+ SmallVector<Instruction *, 8> DeadInsts;
+ if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) {
+ assert(!Operands.empty());
+ if (Operands.size() == Factor && llvm::all_equal(Operands))
+ return Operands[0];
+ }
+ }
+
+ if (match(WideMask, m_AllOnes())) {
+ // Scale the vector length of all-ones mask.
+ ElementCount OrigEC =
+ cast<VectorType>(WideMask->getType())->getElementCount();
+ assert(OrigEC.getKnownMinValue() % Factor == 0);
+ return ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor),
+ cast<Constant>(WideMask)->getSplatValue());
+ }
+
+ return nullptr;
+}
+
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
- LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
-
- if (!LI || !LI->hasOneUse() || !LI->isSimple())
+ Value *LoadedVal = DI->getOperand(0);
+ if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
return false;
SmallVector<Value *, 8> DeinterleaveValues;
@@ -643,16 +669,43 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
DeinterleaveDeadInsts))
return false;
- LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI
- << " with factor = " << DeinterleaveValues.size() << "\n");
+ const unsigned Factor = DeinterleaveValues.size();
- // Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
- return false;
+ if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
+ if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
+ return false;
+ // Check mask operand. Handle both all-true and interleaved mask.
+ Value *WideMask = VPLoad->getOperand(1);
+ Value *Mask = getMask(WideMask, Factor,
+ cast<VectorType>(DeinterleaveValues[0]->getType()));
+ if (!Mask)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic "
+ << *DI << " and factor = " << Factor << "\n");
+
+ // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special
+ // TLI function to emit target-specific interleaved instruction.
+ if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask,
+ DeinterleaveValues))
+ return false;
+
+ } else {
+ auto *LI = cast<LoadInst>(LoadedVal);
+ if (!LI->isSimple())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
+ << " and factor = " << Factor << "\n");
+
+ // Try and match this with target specific intrinsics.
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
+ return false;
+ }
DeadInsts.insert(DeinterleaveDeadInsts.begin(), DeinterleaveDeadInsts.end());
// We now have a target-specific load, so delete the old one.
- DeadInsts.insert(LI);
+ DeadInsts.insert(cast<Instruction>(LoadedVal));
return true;
}
@@ -660,10 +713,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
IntrinsicInst *II, SmallSetVector<Instruction *, 32> &DeadInsts) {
if (!II->hasOneUse())
return false;
-
- StoreInst *SI = dyn_cast<StoreInst>(*(II->users().begin()));
-
- if (!SI || !SI->isSimple())
+ Value *StoredBy = II->user_back();
+ if (!isa<StoreInst, VPIntrinsic>(StoredBy))
return false;
SmallVector<Value *, 8> InterleaveValues;
@@ -671,15 +722,41 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts))
return false;
- LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II
- << " with factor = " << InterleaveValues.size() << "\n");
+ const unsigned Factor = InterleaveValues.size();
- // Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues))
- return false;
+ if (auto *VPStore = dyn_cast<VPIntrinsic>(StoredBy)) {
+ if (VPStore->getIntrinsicID() != Intrinsic::vp_store)
+ return false;
+
+ Value *WideMask = VPStore->getOperand(2);
+ Value *Mask = getMask(WideMask, Factor,
+ cast<VectorType>(InterleaveValues[0]->getType()));
+ if (!Mask)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic "
+ << *II << " and factor = " << Factor << "\n");
+
+ // Since lowerInterleavedStore expects Shuffle and StoreInst, use special
+ // TLI function to emit target-specific interleaved instruction.
+ if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask,
+ InterleaveValues))
+ return false;
+ } else {
+ auto *SI = cast<StoreInst>(StoredBy);
+ if (!SI->isSimple())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II
+ << " and factor = " << Factor << "\n");
+
+ // Try and match this with target specific intrinsics.
+ if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues))
+ return false;
+ }
// We now have a target-specific store, so delete the old one.
- DeadInsts.insert(SI);
+ DeadInsts.insert(cast<Instruction>(StoredBy));
DeadInsts.insert(InterleaveDeadInsts.begin(), InterleaveDeadInsts.end());
return true;
}
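
To illustrate the mask handling in getMask above (an informal sketch, not
part of the patch): for factor 2, the wide mask is accepted only if it is
all-ones or an interleave of identical per-segment masks, e.g.

  %wide.mask = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %m, <vscale x 2 x i1> %m)
  ; getMask(%wide.mask, /*Factor=*/2, ...) returns %m
  ; getMask(<vscale x 4 x i1> splat (i1 true), 2, ...) returns <vscale x 2 x i1> splat (i1 true)
  ; any other wide mask (e.g. an interleave of two different masks) returns
  ; nullptr and the access is left alone, as in the not_same_mask test below.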
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7c3b58389da28ee..2d2213b420f5a46 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -22773,6 +22774,231 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
return true;
}
+static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
+ assert(N);
+ if (N == 1)
+ return true;
+
+ if (isPowerOf2_32(N)) {
+ KnownBits KB = llvm::computeKnownBits(V, DL);
+ return KB.countMinTrailingZeros() >= Log2_32(N);
+ }
+
+ using namespace PatternMatch;
+ // Right now we're only recognizing the simplest pattern.
+ uint64_t C;
+ return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0;
+}
+
+/// Lower an interleaved vp.load into a vlsegN intrinsic.
+///
+/// E.g. Lower an interleaved vp.load (Factor = 2):
+/// %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr,
+/// %mask,
+/// i32 %wide.rvl)
+/// %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> }
+/// @llvm.vector.deinterleave2.nxv64i8(
+/// <vscale x 64 x i8> %l)
+/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0
+/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1
+///
+/// Into:
+/// %rvl = udiv %wide.rvl, 2
+/// %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> }
+/// @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef,
+/// <vscale x 32 x i8> undef,
+/// ptr %ptr,
+/// %mask,
+/// i64 %rvl,
+/// i64 1)
+/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0
+/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1
+///
+/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be
+/// removed by the caller
+/// TODO: We probably can loosen the dependency on matching extractvalue when
+/// dealing with factor of 2 (extractvalue is still required for most of other
+/// factors though).
+bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad(
+ VPIntrinsic *Load, Value *Mask,
+ ArrayRef<Value *> DeinterleaveResults) const {
+ assert(Mask && "Expect a valid mask");
+ assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
+ "Unexpected intrinsic");
+
+ const unsigned Factor = DeinterleaveResults.size();
+
+ auto *WideVTy = dyn_cast<ScalableVectorType>(Load->getType());
+ // TODO: Support fixed vectors.
+ if (!WideVTy)
+ return false;
+
+ unsigned WideNumElements = WideVTy->getElementCount().getKnownMinValue();
+ assert(WideNumElements % Factor == 0 &&
+ "ElementCount of a wide load must be divisible by interleave factor");
+ auto *VTy =
+ VectorType::get(WideVTy->getScalarType(), WideNumElements / Factor,
+ WideVTy->isScalableTy());
+ auto &DL = Load->getModule()->getDataLayout();
+ Align Alignment = Load->getParamAlign(0).value_or(
+ DL.getABITypeAlign(WideVTy->getElementType()));
+ if (!isLegalInterleavedAccessType(
+ VTy, Factor, Alignment,
+ Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
+ return false;
+
+ IRBuilder<> Builder(Load);
+ Value *WideEVL = Load->getArgOperand(2);
+ // Conservatively check if EVL is a multiple of factor, otherwise some
+ // (trailing) elements might be lost after the transformation.
+ if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
+ return false;
+
+ auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+ Value *EVL = Builder.CreateZExt(
+ Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
+ XLenTy);
+
+ static const Intrinsic::ID IntrMaskIds[] = {
+ Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
+ Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
+ Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
+ Intrinsic::riscv_vlseg8_mask,
+ };
+
+ unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
+ unsigned NumElts = VTy->getElementCount().getKnownMinValue();
+ Type *VecTupTy = TargetExtType::get(
+ Load->getContext(), "riscv.vector.tuple",
+ ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
+ NumElts * SEW / 8),
+ Factor);
+
+ Value *PoisonVal = PoisonValue::get(VecTupTy);
+
+ Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
+ Load->getModule(), IntrMaskIds[Factor - 2],
+ {VecTupTy, Mask->getType(), EVL->getType()});
+
+ Value *Operands[] = {
+ PoisonVal,
+ Load->getArgOperand(0),
+ Mask,
+ EVL,
+ ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC),
+ ConstantInt::get(XLenTy, Log2_64(SEW))};
+
+ CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
+
+ SmallVector<Type *, 8> AggrTypes{Factor, VTy};
+ Value *Return =
+ PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
+ Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
+ Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
+ for (unsigned i = 0; i < Factor; ++i) {
+ Value *VecExtract =
+ Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
+ Return = Builder.CreateInsertValue(Return, VecExtract, i);
+ }
+
+ for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
+ // We have to create a brand new ExtractValue to replace each
+ // of these old ExtractValue instructions.
+ Value *NewEV =
+ Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
+ DIO->replaceAllUsesWith(NewEV);
+ }
+
+ return true;
+}
+
+/// Lower an interleaved vp.store into a vssegN intrinsic.
+///
+/// E.g. Lower an interleaved vp.store (Factor = 2):
+///
+/// %is = tail call <vscale x 64 x i8>
+/// @llvm.vector.interleave2.nxv64i8(
+/// <vscale x 32 x i8> %load0,
+/// <vscale x 32 x i8> %load1
+/// %wide.rvl = shl nuw nsw i32 %rvl, 1
+/// tail call void @llvm.vp.store.nxv64i8.p0(
+/// <vscale x 64 x i8> %is, ptr %ptr,
+/// %mask,
+/// i32 %wide.rvl)
+///
+/// Into:
+/// call void @llvm.riscv.vsseg2.mask.nxv32i8.i64(
+/// <vscale x 32 x i8> %load1,
+/// <vscale x 32 x i8> %load2, ptr %ptr,
+/// %mask,
+/// i64 %rvl)
+bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore(
+ VPIntrinsic *Store, Value *Mask,
+ ArrayRef<Value *> InterleaveOperands) const {
+ assert(Mask && "Expect a valid mask");
+ assert(Store->getIntrinsicID() == Intrinsic::vp_store &&
+ "Unexpected intrinsic");
+
+ const unsigned Factor = InterleaveOperands.size();
+
+ auto *VTy = dyn_cast<ScalableVectorType>(InterleaveOperands[0]->getType());
+ // TODO: Support fixed vectors.
+ if (!VTy)
+ return false;
+
+ const DataLayout &DL = Store->getDataLayout();
+ Align Alignment = Store->getParamAlign(1).value_or(
+ DL.getABITypeAlign(VTy->getElementType()));
+ if (!isLegalInterleavedAccessType(
+ VTy, Factor, Alignment,
+ Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL))
+ return false;
+
+ IRBuilder<> Builder(Store);
+ Value *WideEVL = Store->getArgOperand(3);
+ // Conservatively check if EVL is a multiple of factor, otherwise some
+ // (trailing) elements might be lost after the transformation.
+ if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor))
+ return false;
+
+ auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
+ Value *EVL = Builder.CreateZExt(
+ Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
+ XLenTy);
+
+ static const Intrinsic::ID IntrMaskIds[] = {
+ Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
+ Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
+ Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
+ Intrinsic::riscv_vsseg8_mask,
+ };
+
+ unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
+ unsigned NumElts = VTy->getElementCount().getKnownMinValue();
+ Type *VecTupTy = TargetExtType::get(
+ Store->getContext(), "riscv.vector.tuple",
+ ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
+ NumElts * SEW / 8),
+ Factor);
+
+ Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
+ Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy});
+ Value *StoredVal = PoisonValue::get(VecTupTy);
+ for (unsigned i = 0; i < Factor; ++i)
+ StoredVal = Builder.CreateCall(
+ VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)});
+
+ Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
+ Store->getModule(), IntrMaskIds[Factor - 2],
+ {VecTupTy, Mask->getType(), EVL->getType()});
+
+ Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL,
+ ConstantInt::get(XLenTy, Log2_64(SEW))};
+
+ Builder.CreateCall(VssegNFunc, Operands);
+ return true;
+}
+
MachineInstr *
RISCVTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
MachineBasicBlock::instr_iterator &MBBI,
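
A small illustration of the isMultipleOfN check on the EVL used by the two
hooks above (informal; %evl is a placeholder): for factor 2,

  %rvl = mul i32 %evl, 2  ; computeKnownBits proves >= 1 trailing zero bit, so the
                          ; transform proceeds and the segment EVL becomes %rvl / 2
  %bad = or i32 %evl, 1   ; not provably a multiple of 2, so the wide vp.load/vp.store
                          ; is kept as-is (see the invalid_evl test below)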
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 77605a3076a80a1..e9dd8ff96fa37b4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -910,6 +910,14 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleaveIntrinsicToStore(
StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
+ bool lowerDeinterleavedIntrinsicToVPLoad(
+ VPIntrinsic *Load, Value *Mask,
+ ArrayRef<Value *> DeinterleaveRes) const override;
+
+ bool lowerInterleavedIntrinsicToVPStore(
+ VPIntrinsic *Store, Value *Mask,
+ ArrayRef<Value *> InterleaveOps) const override;
+
bool supportKCFIBundles() const override { return true; }
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
new file mode 100644
index 000000000000000..e481891dfd52fc0
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -0,0 +1,816 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV32 %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV64 %s
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor2_v2(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor2_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg2e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: load_factor2_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 33
+; RV64-NEXT: srli a1, a1, 33
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg2e32.v v8, (a0)
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 2
+ %wide.masked.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %rvl)
+ %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor4_v2(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor4_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg4e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: load_factor4_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 34
+; RV64-NEXT: srli a1, a1, 34
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg4e32.v v8, (a0)
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 4
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %rvl)
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor8_v2(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor8_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: srli a1, a1, 3
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg8e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: load_factor8_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 35
+; RV64-NEXT: srli a1, a1, 35
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg8e32.v v8, (a0)
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 8
+ %wide.masked.load = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %rvl)
+ %d0 = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d0, 1
+ %d1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.0)
+ %d1.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 0
+ %d1.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 1
+ %d2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %d0.1)
+ %d2.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 0
+ %d2.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d2, 1
+
+ %d3 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 0
+ %t4 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d3, 1
+ %d4 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d1.1)
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 0
+ %t6 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d4, 1
+ %d5 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.0)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 0
+ %t5 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d5, 1
+ %d6 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d2.1)
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 0
+ %t7 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d6, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ %res4 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3, <vscale x 2 x i32> %t4, 4
+ %res5 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res4, <vscale x 2 x i32> %t5, 5
+ %res6 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res5, <vscale x 2 x i32> %t6, 6
+ %res7 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res6, <vscale x 2 x i32> %t7, 7
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res7
+}
+
+define void @store_factor2_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %evl) {
+; RV32-LABEL: store_factor2_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vsseg2e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: store_factor2_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 33
+; RV64-NEXT: srli a1, a1, 33
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vsseg2e32.v v8, (a0)
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 2
+ %interleaved.vec = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1)
+ call void @llvm.vp.store.nxv2i32.p0(<vscale x 2 x i32> %interleaved.vec, ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %rvl)
+ ret void
+}
+
+define void @store_factor4_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %evl) {
+; RV32-LABEL: store_factor4_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v10, v8
+; RV32-NEXT: vmv1r.v v11, v9
+; RV32-NEXT: vsseg4e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: store_factor4_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 35
+; RV64-NEXT: srli a1, a1, 34
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v10, v8
+; RV64-NEXT: vmv1r.v v11, v9
+; RV64-NEXT: vsseg4e32.v v8, (a0)
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 8
+ %interleaved.vec0 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ %interleaved.vec1 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v1, <vscale x 1 x i32> %v1)
+ %interleaved.vec2 = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %interleaved.vec0, <vscale x 2 x i32> %interleaved.vec1)
+ call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %interleaved.vec2, ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %rvl)
+ ret void
+}
+
+define void @store_factor8_v2(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %evl) {
+; RV32-LABEL: store_factor8_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: srli a1, a1, 3
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v10, v8
+; RV32-NEXT: vmv1r.v v11, v9
+; RV32-NEXT: vmv1r.v v12, v8
+; RV32-NEXT: vmv1r.v v13, v9
+; RV32-NEXT: vmv1r.v v14, v8
+; RV32-NEXT: vmv1r.v v15, v9
+; RV32-NEXT: vsseg8e32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: store_factor8_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 35
+; RV64-NEXT: srli a1, a1, 35
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v10, v8
+; RV64-NEXT: vmv1r.v v11, v9
+; RV64-NEXT: vmv1r.v v12, v8
+; RV64-NEXT: vmv1r.v v13, v9
+; RV64-NEXT: vmv1r.v v14, v8
+; RV64-NEXT: vmv1r.v v15, v9
+; RV64-NEXT: vsseg8e32.v v8, (a0)
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 8
+ %interleaved.vec0 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ %interleaved.vec1 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ %interleaved.vec2 = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %interleaved.vec0, <vscale x 2 x i32> %interleaved.vec1)
+ %interleaved.vec3 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v1, <vscale x 1 x i32> %v1)
+ %interleaved.vec4 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v1, <vscale x 1 x i32> %v1)
+ %interleaved.vec5 = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %interleaved.vec3, <vscale x 2 x i32> %interleaved.vec4)
+ %interleaved.vec6 = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %interleaved.vec2, <vscale x 4 x i32> %interleaved.vec5)
+ call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> %interleaved.vec6, ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %rvl)
+ ret void
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @masked_load_factor2_v2(<vscale x 2 x i1> %mask, ptr %ptr, i32 %evl) {
+; RV32-LABEL: masked_load_factor2_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_load_factor2_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 33
+; RV64-NEXT: srli a1, a1, 33
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 2
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> %interleaved.mask, i32 %rvl)
+ %deinterleaved.results = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 4 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @masked_load_factor4_v2(<vscale x 2 x i1> %mask, ptr %ptr, i32 %evl) {
+; RV32-LABEL: masked_load_factor4_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg4e32.v v8, (a0), v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_load_factor4_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 34
+; RV64-NEXT: srli a1, a1, 34
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg4e32.v v8, (a0), v0.t
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 4
+ %interleaved.mask0 = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
+ %interleaved.mask1 = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
+ %interleaved.mask2 = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %interleaved.mask0, <vscale x 4 x i1> %interleaved.mask1)
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> %interleaved.mask2, i32 %rvl)
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+define void @masked_store_factor2_v2(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %evl) {
+; RV32-LABEL: masked_store_factor2_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv1r.v v9, v8
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_store_factor2_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 33
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv1r.v v9, v8
+; RV64-NEXT: srli a1, a1, 33
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 2
+ %interleaved.mask = tail call <vscale x 2 x i1> @llvm.vector.interleave2.nxv2i1(<vscale x 1 x i1> %mask, <vscale x 1 x i1> %mask)
+ %interleaved.vec = tail call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ tail call void @llvm.vp.store.nxv2i32.p0(<vscale x 2 x i32> %interleaved.vec, ptr %ptr, <vscale x 2 x i1> %interleaved.mask, i32 %rvl)
+ ret void
+}
+
+define void @masked_load_store_factor2_v2_shared_mask(<vscale x 2 x i1> %mask, ptr %ptr, i32 %evl) {
+; RV32-LABEL: masked_load_store_factor2_v2_shared_mask:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t
+; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_load_store_factor2_v2_shared_mask:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 33
+; RV64-NEXT: srli a1, a1, 33
+; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t
+; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 2
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> %interleaved.mask, i32 %rvl)
+ %deinterleaved.results = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 4 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %interleaved.vec = tail call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %t0, <vscale x 2 x i32> %t1)
+ tail call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %interleaved.vec, ptr %ptr, <vscale x 4 x i1> %interleaved.mask, i32 %rvl)
+ ret void
+}
+
+define i32 @masked_load_store_factor2_v2_shared_mask_extract(<vscale x 2 x i1> %mask, ptr %ptr, i32 %evl) {
+; RV32-LABEL: masked_load_store_factor2_v2_shared_mask_extract:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmv1r.v v8, v0
+; RV32-NEXT: vmv.v.i v9, 0
+; RV32-NEXT: li a2, -1
+; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmerge.vim v11, v9, 1, v0
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: vwaddu.vv v12, v11, v11
+; RV32-NEXT: vwmaccu.vx v12, a2, v11
+; RV32-NEXT: vmsne.vi v0, v12, 0
+; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslidedown.vx v11, v12, a3
+; RV32-NEXT: vmerge.vim v10, v10, 1, v0
+; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmsne.vi v0, v11, 0
+; RV32-NEXT: add a2, a3, a3
+; RV32-NEXT: vmerge.vim v9, v9, 1, v0
+; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v10, v9, a3
+; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmsne.vi v0, v10, 0
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma
+; RV32-NEXT: vle32.v v10, (a0), v0.t
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV32-NEXT: vnsrl.wx v13, v10, a1
+; RV32-NEXT: vmv.x.s a1, v10
+; RV32-NEXT: vnsrl.wi v12, v10, 0
+; RV32-NEXT: srli a2, a2, 1
+; RV32-NEXT: vmv1r.v v0, v8
+; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; RV32-NEXT: vsseg2e32.v v12, (a0), v0.t
+; RV32-NEXT: mv a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_load_store_factor2_v2_shared_mask_extract:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmv1r.v v8, v0
+; RV64-NEXT: vmv.v.i v9, 0
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a4, a1, 33
+; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmerge.vim v11, v9, 1, v0
+; RV64-NEXT: srli a3, a3, 2
+; RV64-NEXT: vwaddu.vv v12, v11, v11
+; RV64-NEXT: vwmaccu.vx v12, a2, v11
+; RV64-NEXT: vmsne.vi v0, v12, 0
+; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslidedown.vx v11, v12, a3
+; RV64-NEXT: vmerge.vim v10, v10, 1, v0
+; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmsne.vi v0, v11, 0
+; RV64-NEXT: add a1, a3, a3
+; RV64-NEXT: vmerge.vim v9, v9, 1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v10, v9, a3
+; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmsne.vi v0, v10, 0
+; RV64-NEXT: srli a1, a4, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; RV64-NEXT: vle32.v v10, (a0), v0.t
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; RV64-NEXT: vnsrl.wx v13, v10, a1
+; RV64-NEXT: vmv.x.s a1, v10
+; RV64-NEXT: vnsrl.wi v12, v10, 0
+; RV64-NEXT: srli a4, a4, 33
+; RV64-NEXT: vmv1r.v v0, v8
+; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma
+; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 2
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> %interleaved.mask, i32 %rvl)
+ %deinterleaved.results = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 4 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %r0 = extractelement <vscale x 4 x i32> %wide.masked.load, i32 0
+ %interleaved.vec = tail call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %t0, <vscale x 2 x i32> %t1)
+ tail call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %interleaved.vec, ptr %ptr, <vscale x 4 x i1> %interleaved.mask, i32 %rvl)
+ ret i32 %r0
+}
+
+define void @masked_store_factor4_v2(<vscale x 1 x i1> %mask, <vscale x 1 x i32> %v0, <vscale x 1 x i32> %v1, ptr %ptr, i32 %evl) {
+; RV32-LABEL: masked_store_factor4_v2:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v10, v8
+; RV32-NEXT: vmv1r.v v11, v9
+; RV32-NEXT: vsseg4e32.v v8, (a0), v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: masked_store_factor4_v2:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 34
+; RV64-NEXT: srli a1, a1, 34
+; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v10, v8
+; RV64-NEXT: vmv1r.v v11, v9
+; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 4
+ %interleaved.mask0 = call <vscale x 2 x i1> @llvm.vector.interleave2.nxv2i1(<vscale x 1 x i1> %mask, <vscale x 1 x i1> %mask)
+ %interleaved.mask1 = call <vscale x 2 x i1> @llvm.vector.interleave2.nxv2i1(<vscale x 1 x i1> %mask, <vscale x 1 x i1> %mask)
+ %interleaved.mask2 = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %interleaved.mask0, <vscale x 2 x i1> %interleaved.mask1)
+ %interleaved.vec0 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ %interleaved.vec1 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v1, <vscale x 1 x i32> %v1)
+ %interleaved.vec2 = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %interleaved.vec0, <vscale x 2 x i32> %interleaved.vec1)
+ call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %interleaved.vec2, ptr %ptr, <vscale x 4 x i1> %interleaved.mask2, i32 %rvl)
+ ret void
+}
+
+; Negative tests
+
+; We should not transform this function because the deinterleave tree is not in a desired form.
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @incorrect_extract_value_index(ptr %ptr, i32 %evl) {
+; RV32-LABEL: incorrect_extract_value_index:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT: vnsrl.wi v12, v8, 0
+; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: vnsrl.wx v9, v12, a0
+; RV32-NEXT: vnsrl.wi v8, v12, 0
+; RV32-NEXT: vmv.v.v v10, v9
+; RV32-NEXT: vmv.v.v v11, v9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: incorrect_extract_value_index:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 34
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: li a0, 32
+; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV64-NEXT: vnsrl.wi v12, v8, 0
+; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: vnsrl.wx v9, v12, a0
+; RV64-NEXT: vnsrl.wi v8, v12, 0
+; RV64-NEXT: vmv.v.v v10, v9
+; RV64-NEXT: vmv.v.v v11, v9
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 4
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %rvl)
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+; We should not transform this function because the expression is not a balanced tree.
+define {<vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>} @not_balanced_load_tree(ptr %ptr, i32 %evl) {
+; RV32-LABEL: not_balanced_load_tree:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v12, (a0)
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT: vnsrl.wx v8, v12, a0
+; RV32-NEXT: vnsrl.wi v16, v12, 0
+; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: vnsrl.wi v10, v16, 0
+; RV32-NEXT: vnsrl.wx v11, v16, a0
+; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; RV32-NEXT: vnsrl.wx v12, v11, a0
+; RV32-NEXT: vnsrl.wi v11, v11, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: not_balanced_load_tree:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 34
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v12, (a0)
+; RV64-NEXT: li a0, 32
+; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV64-NEXT: vnsrl.wx v8, v12, a0
+; RV64-NEXT: vnsrl.wi v16, v12, 0
+; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: vnsrl.wi v10, v16, 0
+; RV64-NEXT: vnsrl.wx v11, v16, a0
+; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; RV64-NEXT: vnsrl.wx v12, v11, a0
+; RV64-NEXT: vnsrl.wi v11, v11, 0
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 4
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %rvl)
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %t0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %d1.1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 2 x i32> %d1.1)
+ %t2 = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } %d2, 0
+ %t3 = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } poison, <vscale x 4 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } %res1, <vscale x 1 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } %res2, <vscale x 1 x i32> %t3, 3
+ ret { <vscale x 4 x i32>, <vscale x 2 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } %res3
+}
+
+define void @not_balanced_store_tree(<vscale x 1 x i32> %v0, <vscale x 2 x i32> %v1, <vscale x 4 x i32> %v2, ptr %ptr, i32 %evl) {
+; RV32-LABEL: not_balanced_store_tree:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; RV32-NEXT: vwaddu.vv v12, v8, v8
+; RV32-NEXT: li a2, -1
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: vwmaccu.vx v12, a2, v8
+; RV32-NEXT: srli a3, a3, 3
+; RV32-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vx v8, v12, a3
+; RV32-NEXT: add a4, a3, a3
+; RV32-NEXT: vsetvli zero, a4, e32, m1, ta, ma
+; RV32-NEXT: vslideup.vx v12, v8, a3
+; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV32-NEXT: vwaddu.vv v14, v12, v9
+; RV32-NEXT: vwmaccu.vx v14, a2, v9
+; RV32-NEXT: vsetvli a3, zero, e32, m2, ta, ma
+; RV32-NEXT: vwaddu.vv v16, v14, v10
+; RV32-NEXT: vwmaccu.vx v16, a2, v10
+; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV32-NEXT: vse32.v v16, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: not_balanced_store_tree:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; RV64-NEXT: vwaddu.vv v12, v8, v8
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a1, a1, 34
+; RV64-NEXT: vwmaccu.vx v12, a2, v8
+; RV64-NEXT: srli a3, a3, 3
+; RV64-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vx v8, v12, a3
+; RV64-NEXT: add a4, a3, a3
+; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma
+; RV64-NEXT: vslideup.vx v12, v8, a3
+; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV64-NEXT: vwaddu.vv v14, v12, v9
+; RV64-NEXT: vwmaccu.vx v14, a2, v9
+; RV64-NEXT: vsetvli a3, zero, e32, m2, ta, ma
+; RV64-NEXT: vwaddu.vv v16, v14, v10
+; RV64-NEXT: vwmaccu.vx v16, a2, v10
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vse32.v v16, (a0)
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 4
+ %interleaved.vec0 = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> %v0, <vscale x 1 x i32> %v0)
+ %interleaved.vec1 = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 2 x i32> %interleaved.vec0, <vscale x 2 x i32> %v1)
+ %interleaved.vec2 = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 4 x i32> %interleaved.vec1, <vscale x 4 x i32> %v2)
+ call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> %interleaved.vec2, ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %rvl)
+ ret void
+}
+
+; We only support scalable vectors for now.
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %evl) {
+; RV32-LABEL: not_scalable_vectors:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vnsrl.wx v12, v8, a0
+; RV32-NEXT: vnsrl.wi v11, v8, 0
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vnsrl.wx v10, v11, a0
+; RV32-NEXT: vnsrl.wi v8, v11, 0
+; RV32-NEXT: vnsrl.wx v11, v12, a0
+; RV32-NEXT: vnsrl.wi v9, v12, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: not_scalable_vectors:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 34
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: li a0, 32
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vnsrl.wx v12, v8, a0
+; RV64-NEXT: vnsrl.wi v11, v8, 0
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vnsrl.wx v10, v11, a0
+; RV64-NEXT: vnsrl.wi v8, v11, 0
+; RV64-NEXT: vnsrl.wx v11, v12, a0
+; RV64-NEXT: vnsrl.wi v9, v12, 0
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 4
+ %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %rvl)
+ %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1
+ %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0)
+ %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0
+ %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1
+ %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1)
+ %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0
+ %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0
+ %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1
+ %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2
+ %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3
+}
+
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @not_same_mask(<vscale x 2 x i1> %mask0, <vscale x 2 x i1> %mask1, ptr %ptr, i32 %evl) {
+; RV32-LABEL: not_same_mask:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmv1r.v v9, v0
+; RV32-NEXT: vmv1r.v v0, v8
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: li a2, -1
+; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmerge.vim v11, v8, 1, v0
+; RV32-NEXT: vmv1r.v v0, v9
+; RV32-NEXT: vmerge.vim v9, v8, 1, v0
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: vwaddu.vv v12, v9, v11
+; RV32-NEXT: vwmaccu.vx v12, a2, v11
+; RV32-NEXT: vmsne.vi v0, v12, 0
+; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslidedown.vx v9, v12, a3
+; RV32-NEXT: vmerge.vim v10, v10, 1, v0
+; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmsne.vi v0, v9, 0
+; RV32-NEXT: add a2, a3, a3
+; RV32-NEXT: vmerge.vim v8, v8, 1, v0
+; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v10, v8, a3
+; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmsne.vi v0, v10, 0
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; RV32-NEXT: vle32.v v10, (a0), v0.t
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: vnsrl.wx v9, v10, a0
+; RV32-NEXT: vnsrl.wi v8, v10, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: not_same_mask:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmv1r.v v9, v0
+; RV64-NEXT: vmv1r.v v0, v8
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a1, a1, 33
+; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmerge.vim v11, v8, 1, v0
+; RV64-NEXT: vmv1r.v v0, v9
+; RV64-NEXT: vmerge.vim v9, v8, 1, v0
+; RV64-NEXT: srli a3, a3, 2
+; RV64-NEXT: vwaddu.vv v12, v9, v11
+; RV64-NEXT: vwmaccu.vx v12, a2, v11
+; RV64-NEXT: vmsne.vi v0, v12, 0
+; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslidedown.vx v9, v12, a3
+; RV64-NEXT: vmerge.vim v10, v10, 1, v0
+; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmsne.vi v0, v9, 0
+; RV64-NEXT: add a2, a3, a3
+; RV64-NEXT: vmerge.vim v8, v8, 1, v0
+; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v10, v8, a3
+; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmsne.vi v0, v10, 0
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; RV64-NEXT: vle32.v v10, (a0), v0.t
+; RV64-NEXT: li a0, 32
+; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: vnsrl.wx v9, v10, a0
+; RV64-NEXT: vnsrl.wi v8, v10, 0
+; RV64-NEXT: ret
+ %rvl = mul i32 %evl, 2
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask0, <vscale x 2 x i1> %mask1)
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> %interleaved.mask, i32 %rvl)
+ %deinterleaved.results = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 4 x i32> %wide.masked.load)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
+}
+
+; EVL should be a multiple of factor
+define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @invalid_evl(ptr %ptr, i32 %evl) {
+; RV32-LABEL: invalid_evl:
+; RV32: # %bb.0:
+; RV32-NEXT: ori a1, a1, 1
+; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT: vnsrl.wx v12, v8, a0
+; RV32-NEXT: vnsrl.wi v14, v8, 0
+; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: vnsrl.wx v10, v14, a0
+; RV32-NEXT: vnsrl.wi v8, v14, 0
+; RV32-NEXT: vnsrl.wx v11, v12, a0
+; RV32-NEXT: vnsrl.wi v9, v12, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: invalid_evl:
+; RV64: # %bb.0:
+; RV64-NEXT: ori a1, a1, 1
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: li a0, 32
+; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV64-NEXT: vnsrl.wx v12, v8, a0
+; RV64-NEXT: vnsrl.wi v14, v8, 0
+; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV64-NEXT: vnsrl.wx v10, v14, a0
+; RV64-NEXT: vnsrl.wi v8, v14, 0
+; RV64-NEXT: vnsrl.wx v11, v12, a0
+; RV64-NEXT: vnsrl.wi v9, v12, 0
+; RV64-NEXT: ret
+ %rvl = or i32 %evl, 1
+ %wide.masked.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %rvl)
+ %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.masked.load)
+ %d0.0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0
+ %d0.1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1
+ %d1 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.0)
+ %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 0
+ %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d1, 1
+ %d2 = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %d0.1)
+ %t1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 0
+ %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %d2, 1
+
+ %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+ %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t1, 1
+ %res2 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1, <vscale x 2 x i32> %t2, 2
+ %res3 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res2, <vscale x 2 x i32> %t3, 3
+ ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res3
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}