[llvm] [LV] Support scalable interleave groups for factors 3, 5, 6 and 7 (PR #141865)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Fri May 30 08:59:16 PDT 2025
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/141865
From d7c9f2040068291a522f526bb00ba3245086a572 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 26 May 2025 20:16:58 +0100
Subject: [PATCH 1/6] [LV] Support scalable interleave groups for factors 3,5,6
and 7
---
.../AArch64/AArch64TargetTransformInfo.cpp | 6 +
.../Transforms/Vectorize/LoopVectorize.cpp | 17 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 111 +++-
.../AArch64/sve-interleaved-accesses.ll | 50 +-
.../sve-interleaved-masked-accesses.ll | 84 +--
.../RISCV/interleaved-accesses.ll | 626 +++++++++---------
 6 files changed, 460 insertions(+), 434 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 68aec80f07e1d..8cbc30b071d6b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4575,6 +4575,12 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
if (VecTy->isScalableTy() && !ST->hasSVE())
return InstructionCost::getInvalid();
+  // Currently, factors 2 and 4 can be de[interleave]d with scalable vectors.
+  // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
+  // InterleavedAccessPass for ld3/st3.
+ if (VecTy->isScalableTy() && Factor != 2 && Factor != 4)
+ return InstructionCost::getInvalid();
+
// Vectorization for masked interleaved accesses is only enabled for scalable
// VF.
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
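
To make the gate above concrete: with scalable vectors, any factor other
than 2 or 4 now reports an invalid cost on AArch64, because
InterleavedAccessPass only knows how to lower those factors to ld2/st2 and
ld4/st4. A minimal standalone C++ sketch of that behavior (not the real TTI
interface; the function name and the PerMemberCost parameter are made up
for illustration):

    #include <cstdio>
    #include <optional>

    // Hypothetical stand-in for InstructionCost; std::nullopt plays the
    // role of InstructionCost::getInvalid().
    using Cost = std::optional<unsigned>;

    Cost scalableInterleaveCost(unsigned Factor, unsigned PerMemberCost) {
      // Mirrors the gate above: no ld3/st3-style lowering exists yet, so
      // only factors 2 and 4 get a valid cost with scalable vectors.
      if (Factor != 2 && Factor != 4)
        return std::nullopt;
      return Factor * PerMemberCost;
    }

    int main() {
      for (unsigned F : {2u, 3u, 4u, 5u}) {
        Cost C = scalableInterleaveCost(F, /*PerMemberCost=*/2);
        std::printf("factor %u: %s\n", F, C ? "valid" : "invalid");
      }
    }
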
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8040d375f0dbd..2b440c778ec1a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3193,10 +3193,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
if (hasIrregularType(ScalarTy, DL))
return false;
- // For scalable vectors, the only interleave factor currently supported
- // must be power of 2 since we require the (de)interleave2 intrinsics
- // instead of shufflevectors.
- if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
+  // For scalable vectors, the interleave factor must be <= 8 or a power of 2
+  // since we require the (de)interleaveN intrinsics instead of shufflevectors.
+ if (VF.isScalable() &&
+ !(InterleaveFactor <= 8 || isPowerOf2_32(InterleaveFactor)))
return false;
// If the group involves a non-integral pointer, we may not be able to
@@ -9058,10 +9058,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
bool Result = (VF.isVector() && // Query is illegal for VF == 1
CM.getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
- // For scalable vectors, the only interleave factor currently supported
- // must be power of 2 since we require the (de)interleave2 intrinsics
- // instead of shufflevectors.
- assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
+  // For scalable vectors, the interleave factor must be <= 8 or a power
+  // of 2 since we require the (de)interleaveN intrinsics instead of
+  // shufflevectors.
+ assert((!Result || !VF.isScalable() ||
+ (IG->getFactor() <= 8 || isPowerOf2_32(IG->getFactor()))) &&
"Unsupported interleave factor for scalable vectors");
return Result;
};
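
The legality change in interleavedAccessCanBeWidened and the matching
assert above reduce to one predicate. A standalone sketch in plain C++
(isPowerOf2_32 is re-implemented locally rather than taken from
llvm/Support/MathExtras.h) of which factors are now accepted for scalable
VFs:

    #include <cstdint>
    #include <cstdio>

    static bool isPowerOf2_32(uint32_t V) { return V && !(V & (V - 1)); }

    // Factors <= 8 map to the dedicated llvm.vector.[de]interleaveN
    // intrinsics; larger powers of 2 fall back to the recursive factor-2
    // lowering. Everything else is rejected for scalable VFs.
    static bool scalableFactorSupported(uint32_t Factor) {
      return Factor <= 8 || isPowerOf2_32(Factor);
    }

    int main() {
      for (uint32_t F : {2u, 3u, 5u, 7u, 8u, 12u, 16u})
        std::printf("factor %2u: %s\n", F,
                    scalableFactorSupported(F) ? "widened" : "rejected");
    }
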
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 266f658b1f9c1..1d4c95633a680 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3217,6 +3217,48 @@ static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}
+static Intrinsic::ID getInterleaveIntrinsicID(unsigned Factor) {
+  switch (Factor) {
+  case 2:
+    return Intrinsic::vector_interleave2;
+  case 3:
+    return Intrinsic::vector_interleave3;
+  case 4:
+    return Intrinsic::vector_interleave4;
+  case 5:
+    return Intrinsic::vector_interleave5;
+  case 6:
+    return Intrinsic::vector_interleave6;
+  case 7:
+    return Intrinsic::vector_interleave7;
+  case 8:
+    return Intrinsic::vector_interleave8;
+  default:
+    llvm_unreachable("Unexpected factor");
+  }
+}
+
+static Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor) {
+  switch (Factor) {
+  case 2:
+    return Intrinsic::vector_deinterleave2;
+  case 3:
+    return Intrinsic::vector_deinterleave3;
+  case 4:
+    return Intrinsic::vector_deinterleave4;
+  case 5:
+    return Intrinsic::vector_deinterleave5;
+  case 6:
+    return Intrinsic::vector_deinterleave6;
+  case 7:
+    return Intrinsic::vector_deinterleave7;
+  case 8:
+    return Intrinsic::vector_deinterleave8;
+  default:
+    llvm_unreachable("Unexpected factor");
+  }
+}
+
/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
@@ -3233,6 +3289,14 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
+ if (Factor <= 8) {
+ VectorType *InterleaveTy = VectorType::get(
+ VecTy->getElementType(),
+ VecTy->getElementCount().multiplyCoefficientBy(Factor));
+ return Builder.CreateIntrinsic(InterleaveTy,
+ getInterleaveIntrinsicID(Factor), Vals,
+ /*FMFSource=*/nullptr, Name);
+ }
assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
"scalable vectors, must be power of 2");
SmallVector<Value *> InterleavingValues(Vals);
@@ -3333,7 +3397,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
&InterleaveFactor](Value *MaskForGaps) -> Value * {
if (State.VF.isScalable()) {
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
- assert(isPowerOf2_32(InterleaveFactor) &&
+ assert((InterleaveFactor <= 8 || isPowerOf2_32(InterleaveFactor)) &&
"Unsupported deinterleave factor for scalable vectors");
auto *ResBlockInMask = State.get(BlockInMask);
SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
@@ -3377,34 +3441,45 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
ArrayRef<VPValue *> VPDefs = definedValues();
const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
if (VecTy->isScalableTy()) {
- assert(isPowerOf2_32(InterleaveFactor) &&
- "Unsupported deinterleave factor for scalable vectors");
-
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
- DeinterleavedValues[0] = NewLoad;
- // For the case of InterleaveFactor > 2, we will have to do recursive
- // deinterleaving, because the current available deinterleave intrinsic
- // supports only Factor of 2, otherwise it will bailout after first
- // iteration.
- // When deinterleaving, the number of values will double until we
- // have "InterleaveFactor".
- for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
- NumVectors *= 2) {
- // Deinterleave the elements within the vector
- SmallVector<Value *> TempDeinterleavedValues(NumVectors);
- for (unsigned I = 0; I < NumVectors; ++I) {
- auto *DiTy = DeinterleavedValues[I]->getType();
- TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
- Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
- /*FMFSource=*/nullptr, "strided.vec");
+
+ if (InterleaveFactor <= 8) {
+ Value *Deinterleave = State.Builder.CreateIntrinsic(
+ getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
+ NewLoad,
+ /*FMFSource=*/nullptr, "strided.vec");
+ for (unsigned I = 0; I < InterleaveFactor; I++)
+ DeinterleavedValues[I] =
+ State.Builder.CreateExtractValue(Deinterleave, I);
+ } else {
+ assert(isPowerOf2_32(InterleaveFactor) &&
+ "Unsupported deinterleave factor for scalable vectors");
+ DeinterleavedValues[0] = NewLoad;
+      // Factors larger than 8 have no dedicated [de]interleave
+      // intrinsic, so we deinterleave recursively using the factor-2
+      // intrinsic; this is why the factor must be a power of 2 to
+      // reach this path.
+      // Each round of deinterleaving doubles the number of values
+      // until we have "InterleaveFactor" of them.
+ for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
+ NumVectors *= 2) {
+ // Deinterleave the elements within the vector
+ SmallVector<Value *> TempDeinterleavedValues(NumVectors);
+ for (unsigned I = 0; I < NumVectors; ++I) {
+ auto *DiTy = DeinterleavedValues[I]->getType();
+ TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
+ Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
+ /*FMFSource=*/nullptr, "strided.vec");
+ }
+ // Extract the deinterleaved values:
+ for (unsigned I = 0; I < 2; ++I)
+ for (unsigned J = 0; J < NumVectors; ++J)
+ DeinterleavedValues[NumVectors * I + J] =
+ State.Builder.CreateExtractValue(TempDeinterleavedValues[J],
+ I);
}
- // Extract the deinterleaved values:
- for (unsigned I = 0; I < 2; ++I)
- for (unsigned J = 0; J < NumVectors; ++J)
- DeinterleavedValues[NumVectors * I + J] =
- State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
}
#ifndef NDEBUG
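
For factors above 8 the recipe still uses the recursive factor-2
deinterleave, and the index arithmetic in that loop is the subtle part.
Below is a standalone C++ model of it, under two simplifying assumptions:
each vector is represented by the list of original lane indices it holds,
and deinterleave2 is modeled as an even/odd split. Running it shows that
member M of a factor-16 group ends up holding lanes M, M+16, M+32, ...,
which is exactly the strided layout the interleave group describes:

    #include <cstdio>
    #include <utility>
    #include <vector>

    using Vec = std::vector<int>;

    // Model of llvm.vector.deinterleave2: even lanes go to the first
    // result, odd lanes to the second.
    static std::pair<Vec, Vec> deinterleave2(const Vec &V) {
      Vec Even, Odd;
      for (size_t I = 0; I < V.size(); ++I)
        (I % 2 ? Odd : Even).push_back(V[I]);
      return {Even, Odd};
    }

    int main() {
      const unsigned Factor = 16; // power of 2 above 8: recursive path
      Vec WideLoad(Factor * 4);   // four elements per member
      for (size_t I = 0; I < WideLoad.size(); ++I)
        WideLoad[I] = static_cast<int>(I);

      std::vector<Vec> Deinterleaved(Factor);
      Deinterleaved[0] = WideLoad;
      // Mirrors the loop in VPInterleaveRecipe::execute: each round
      // doubles the number of values until we have Factor of them.
      for (unsigned NumVectors = 1; NumVectors < Factor; NumVectors *= 2) {
        std::vector<std::pair<Vec, Vec>> Tmp(NumVectors);
        for (unsigned I = 0; I < NumVectors; ++I)
          Tmp[I] = deinterleave2(Deinterleaved[I]);
        // DeinterleavedValues[NumVectors * I + J] = extractvalue(Tmp[J], I)
        for (unsigned I = 0; I < 2; ++I)
          for (unsigned J = 0; J < NumVectors; ++J)
            Deinterleaved[NumVectors * I + J] =
                I ? Tmp[J].second : Tmp[J].first;
      }

      for (unsigned M = 0; M < Factor; ++M)
        std::printf("member %2u: lanes %d, %d, %d, %d\n", M,
                    Deinterleaved[M][0], Deinterleaved[M][1],
                    Deinterleaved[M][2], Deinterleaved[M][3]);
    }
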
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 6861644fc9969..10939de3e9fe0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -1479,34 +1479,24 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP7]])
-; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP8]])
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 3
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <vscale x 16 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
-; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 1
-; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP14]])
-; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP15]])
-; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 0
-; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 1
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 2
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 3
; CHECK-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
; CHECK-NEXT: [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
; CHECK-NEXT: [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP23]])
-; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP24]])
-; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC12]])
+; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]])
; CHECK-NEXT: store <vscale x 16 x i32> [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1595,15 +1585,11 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP10]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
-; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP11]])
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP12]])
-; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 3
; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
; CHECK-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
; CHECK-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
@@ -1622,9 +1608,7 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
; CHECK-NEXT: [[REVERSE8:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-NEXT: [[REVERSE9:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE6]], <vscale x 4 x i32> [[REVERSE8]])
-; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE7]], <vscale x 4 x i32> [[REVERSE9]])
-; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC10]])
+; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> [[REVERSE6]], <vscale x 4 x i32> [[REVERSE7]], <vscale x 4 x i32> [[REVERSE8]], <vscale x 4 x i32> [[REVERSE9]])
; CHECK-NEXT: store <vscale x 16 x i32> [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 469faf67a71b3..3567aff0ace4e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -469,36 +469,26 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2
; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK2]], <vscale x 64 x i8> poison)
-; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP11]])
-; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP12]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 0
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 0
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 1
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 1
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP19]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP20]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave2.nxv64i8(<vscale x 32 x i8> [[INTERLEAVED_VEC]], <vscale x 32 x i8> [[INTERLEAVED_VEC5]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK7]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
-; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]])
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK1]])
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; SCALAR_TAIL_FOLDING: middle.block:
; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -531,37 +521,27 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK2]], <vscale x 64 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP11]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 0
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 0
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 1
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP19]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP20]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave2.nxv64i8(<vscale x 32 x i8> [[INTERLEAVED_VEC]], <vscale x 32 x i8> [[INTERLEAVED_VEC5]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK7]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
-; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK1]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
; PREDICATED_TAIL_FOLDING: middle.block:
; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]]
; PREDICATED_TAIL_FOLDING: scalar.ph:
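
One detail worth calling out in the masked tests above: the group mask is
built by passing the same block mask to the interleave intrinsic once per
member (interleave4 with four copies of TMP7), so every member of an
iteration inherits that iteration's predicate bit. A minimal C++ model of
that replication, with illustrative names only:

    #include <cstdio>
    #include <vector>

    using Mask = std::vector<bool>;

    // interleaveN(M, M, ..., M) with N identical masks repeats each
    // predicate bit N times, one copy per group member.
    static Mask interleaveCopies(const Mask &BlockMask, unsigned Factor) {
      Mask Out;
      for (bool Bit : BlockMask)
        for (unsigned Member = 0; Member < Factor; ++Member)
          Out.push_back(Bit);
      return Out;
    }

    int main() {
      Mask BlockMask = {true, false, true, true};
      Mask GroupMask = interleaveCopies(BlockMask, 4); // factor-4 group
      for (bool Bit : GroupMask)
        std::printf("%d", Bit ? 1 : 0); // prints 1111000011111111
      std::printf("\n");
    }
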
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index f48691bd54417..9b55973def6e9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -354,32 +354,40 @@ exit:
define void @load_store_factor3_i32(ptr %p) {
; CHECK-LABEL: @load_store_factor3_i32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP7]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 12 x i32>, ptr [[Q0]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
+; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
+; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 3)
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]])
+; CHECK-NEXT: store <vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
+; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
@@ -458,32 +466,40 @@ define void @load_store_factor3_i32(ptr %p) {
;
; SCALABLE-LABEL: @load_store_factor3_i32(
; SCALABLE-NEXT: entry:
-; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; SCALABLE: vector.ph:
+; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; SCALABLE-NEXT: br label [[LOOP:%.*]]
; SCALABLE: vector.body:
; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4
-; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
-; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; SCALABLE-NEXT: [[TMP2:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; SCALABLE-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; SCALABLE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP7]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
-; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8
-; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 12 x i32>, ptr [[Q0]], align 4
+; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> [[WIDE_VEC]])
+; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
+; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
+; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 3)
+; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]])
+; SCALABLE-NEXT: store <vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4
+; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
+; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
; SCALABLE: middle.block:
-; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; SCALABLE: scalar.ph:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
; SCALABLE-NEXT: br label [[LOOP1:%.*]]
; SCALABLE: loop:
; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
@@ -541,32 +557,40 @@ exit:
define void @load_store_factor3_i64(ptr %p) {
; CHECK-LABEL: @load_store_factor3_i64(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> [[TMP6]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP7]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[Q0]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP8]], splat (i64 1)
+; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 3)
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
+; CHECK-NEXT: store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
+; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
@@ -645,32 +669,40 @@ define void @load_store_factor3_i64(ptr %p) {
;
; SCALABLE-LABEL: @load_store_factor3_i64(
; SCALABLE-NEXT: entry:
-; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; SCALABLE: vector.ph:
+; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
; SCALABLE-NEXT: br label [[LOOP:%.*]]
; SCALABLE: vector.body:
; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; SCALABLE-NEXT: [[TMP2:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> [[TMP6]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP7]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4
-; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[Q0]], align 8
+; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP8]], splat (i64 1)
+; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
+; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 3)
+; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
+; SCALABLE-NEXT: store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
+; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
+; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; SCALABLE: middle.block:
-; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; SCALABLE: scalar.ph:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
; SCALABLE-NEXT: br label [[LOOP1:%.*]]
; SCALABLE: loop:
; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
@@ -745,22 +777,16 @@ define void @load_store_factor4(ptr %p) {
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[TMP7]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP8]])
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP9]])
-; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 3
; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 1)
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP12]], splat (i64 3)
; CHECK-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i64> [[TMP13]], splat (i64 4)
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP16]])
-; CHECK-NEXT: [[INTERLEAVED_VEC3:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP17]])
-; CHECK-NEXT: [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC]], <vscale x 4 x i64> [[INTERLEAVED_VEC3]])
+; CHECK-NEXT: [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> [[TMP17]])
; CHECK-NEXT: store <vscale x 8 x i64> [[INTERLEAVED_VEC4]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -877,22 +903,16 @@ define void @load_store_factor4(ptr %p) {
; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 4
; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[TMP7]], align 8
-; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP8]])
-; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP9]])
-; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
+; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 3
; SCALABLE-NEXT: [[TMP14:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 1)
; SCALABLE-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
; SCALABLE-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP12]], splat (i64 3)
; SCALABLE-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i64> [[TMP13]], splat (i64 4)
-; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP16]])
-; SCALABLE-NEXT: [[INTERLEAVED_VEC3:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP17]])
-; SCALABLE-NEXT: [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC]], <vscale x 4 x i64> [[INTERLEAVED_VEC3]])
+; SCALABLE-NEXT: [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> [[TMP17]])
; SCALABLE-NEXT: store <vscale x 8 x i64> [[INTERLEAVED_VEC4]], ptr [[TMP7]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
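
Both factor-4 hunks here swap the old two-level tree of (de)interleave2 calls for a single (de)interleave4. The two emission strategies produce the same lane order; the tree form just routes even-indexed fields through one half and odd-indexed fields through the other. As a sketch, with %f0..%f3 standing in for the four updated fields (illustrative names, types from the checks above):

; old tree form: even-indexed fields in %lo, odd-indexed fields in %hi
%lo = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %f0, <vscale x 2 x i64> %f2)
%hi = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %f1, <vscale x 2 x i64> %f3)
%v  = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %lo, <vscale x 4 x i64> %hi)
; new single-call form, same result
%v2 = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> %f0, <vscale x 2 x i64> %f1, <vscale x 2 x i64> %f2, <vscale x 2 x i64> %f3)
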
@@ -970,38 +990,41 @@ exit:
define void @load_store_factor5(ptr %p) {
; CHECK-LABEL: @load_store_factor5(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 5
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <10 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 0, i32 5>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 1, i32 6>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 2, i32 7>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 3, i32 8>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 4, i32 9>
-; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <10 x i64> [[TMP11]], <10 x i64> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
-; CHECK-NEXT: store <10 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 5 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave5.nxv5i64(<vscale x 5 x i64> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vector.interleave5.nxv5i64(<vscale x 1 x i64> [[TMP10]], <vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]])
+; CHECK-NEXT: store <vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
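
Because the induction step is now the runtime quantity vscale (one <vscale x 1 x i64> lane per field), the trip-count logic above can no longer be constant-folded the way the old fixed <2 x i64> version was. A quick worked example, assuming vscale = 2 (e.g. RVV with VLEN = 128, since vscale = VLEN/64 there):

; N_MOD_VF = 1024 urem 2 = 0
; N_VEC    = 1024 - 0    = 1024   -> 512 vector iterations of step 2
; CMP_N    = (1024 == 1024)       -> true, so the scalar epilogue is skipped
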
@@ -1106,38 +1129,41 @@ define void @load_store_factor5(ptr %p) {
;
; SCALABLE-LABEL: @load_store_factor5(
; SCALABLE-NEXT: entry:
-; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALABLE: vector.ph:
+; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALABLE: vector.body:
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 5
; SCALABLE-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <10 x i64>, ptr [[TMP1]], align 8
-; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 0, i32 5>
-; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 1, i32 6>
-; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 2, i32 7>
-; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 3, i32 8>
-; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 4, i32 9>
-; SCALABLE-NEXT: [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <10 x i64> [[TMP11]], <10 x i64> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
-; SCALABLE-NEXT: store <10 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 5 x i64>, ptr [[TMP1]], align 8
+; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave5.nxv5i64(<vscale x 5 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT: [[TMP10:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; SCALABLE-NEXT: [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; SCALABLE-NEXT: [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vector.interleave5.nxv5i64(<vscale x 1 x i64> [[TMP10]], <vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]])
+; SCALABLE-NEXT: store <vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; SCALABLE: middle.block:
-; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; SCALABLE: scalar.ph:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; SCALABLE-NEXT: br label [[LOOP:%.*]]
; SCALABLE: loop:
; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1217,41 +1243,43 @@ exit:
define void @load_store_factor6(ptr %p) {
; CHECK-LABEL: @load_store_factor6(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 0, i32 6>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 1, i32 7>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 2, i32 8>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 3, i32 9>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 4, i32 10>
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 5, i32 11>
-; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
-; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave6.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave6.nxv6i64(<vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]])
+; CHECK-NEXT: store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1369,41 +1397,43 @@ define void @load_store_factor6(ptr %p) {
;
; SCALABLE-LABEL: @load_store_factor6(
; SCALABLE-NEXT: entry:
-; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALABLE: vector.ph:
+; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALABLE: vector.body:
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 6
; SCALABLE-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP1]], align 8
-; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 0, i32 6>
-; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 1, i32 7>
-; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 2, i32 8>
-; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 3, i32 9>
-; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 4, i32 10>
-; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 5, i32 11>
-; SCALABLE-NEXT: [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; SCALABLE-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
-; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[TMP1]], align 8
+; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave6.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; SCALABLE-NEXT: [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; SCALABLE-NEXT: [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; SCALABLE-NEXT: [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; SCALABLE-NEXT: [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave6.nxv6i64(<vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]])
+; SCALABLE-NEXT: store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; SCALABLE: middle.block:
-; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; SCALABLE: scalar.ph:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; SCALABLE-NEXT: br label [[LOOP:%.*]]
; SCALABLE: loop:
; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1494,45 +1524,45 @@ exit:
define void @load_store_factor7(ptr %p) {
; CHECK-LABEL: @load_store_factor7(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[INDEX]], 7
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <14 x i64>, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 0, i32 7>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 1, i32 8>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 2, i32 9>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 3, i32 10>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 4, i32 11>
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 5, i32 12>
-; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 6, i32 13>
-; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP19]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP13]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <6 x i64> [[TMP14]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> [[TMP15]], <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <14 x i64> [[TMP16]], <14 x i64> poison, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
-; CHECK-NEXT: store <14 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 7 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave7.nxv7i64(<vscale x 7 x i64> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; CHECK-NEXT: [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; CHECK-NEXT: [[TMP18:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 7)
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vector.interleave7.nxv7i64(<vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]])
+; CHECK-NEXT: store <vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1664,45 +1694,45 @@ define void @load_store_factor7(ptr %p) {
;
; SCALABLE-LABEL: @load_store_factor7(
; SCALABLE-NEXT: entry:
-; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SCALABLE: vector.ph:
+; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALABLE: vector.body:
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[INDEX]], 7
; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]]
-; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <14 x i64>, ptr [[TMP4]], align 8
-; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 0, i32 7>
-; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 1, i32 8>
-; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 2, i32 9>
-; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 3, i32 10>
-; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 4, i32 11>
-; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 5, i32 12>
-; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 6, i32 13>
-; SCALABLE-NEXT: [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT: [[TMP18:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; SCALABLE-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; SCALABLE-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
-; SCALABLE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP19]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SCALABLE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP13]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; SCALABLE-NEXT: [[TMP15:%.*]] = shufflevector <6 x i64> [[TMP14]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; SCALABLE-NEXT: [[TMP16:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> [[TMP15]], <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
-; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <14 x i64> [[TMP16]], <14 x i64> poison, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
-; SCALABLE-NEXT: store <14 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 7 x i64>, ptr [[TMP4]], align 8
+; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave7.nxv7i64(<vscale x 7 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; SCALABLE-NEXT: [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; SCALABLE-NEXT: [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; SCALABLE-NEXT: [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; SCALABLE-NEXT: [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; SCALABLE-NEXT: [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; SCALABLE-NEXT: [[TMP18:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 7)
+; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vector.interleave7.nxv7i64(<vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]])
+; SCALABLE-NEXT: store <vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; SCALABLE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; SCALABLE: middle.block:
-; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; SCALABLE: scalar.ph:
-; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; SCALABLE-NEXT: br label [[LOOP:%.*]]
; SCALABLE: loop:
; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1818,27 +1848,15 @@ define void @load_store_factor8(ptr %p) {
; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3
; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[Q0]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP5]])
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP6]])
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP7]])
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP8]])
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP9]])
-; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP10]])
-; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave8.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 7
; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 1)
; CHECK-NEXT: [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 2)
; CHECK-NEXT: [[TMP22:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 3)
@@ -1847,13 +1865,7 @@ define void @load_store_factor8(ptr %p) {
; CHECK-NEXT: [[TMP25:%.*]] = add <vscale x 1 x i64> [[TMP17]], splat (i64 6)
; CHECK-NEXT: [[TMP26:%.*]] = add <vscale x 1 x i64> [[TMP18]], splat (i64 7)
; CHECK-NEXT: [[TMP27:%.*]] = add <vscale x 1 x i64> [[TMP19]], splat (i64 8)
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP24]])
-; CHECK-NEXT: [[INTERLEAVED_VEC7:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP25]])
-; CHECK-NEXT: [[INTERLEAVED_VEC8:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP26]])
-; CHECK-NEXT: [[INTERLEAVED_VEC9:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP23]], <vscale x 1 x i64> [[TMP27]])
-; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC]], <vscale x 2 x i64> [[INTERLEAVED_VEC8]])
-; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC7]], <vscale x 2 x i64> [[INTERLEAVED_VEC9]])
-; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC10]], <vscale x 4 x i64> [[INTERLEAVED_VEC11]])
+; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave8.nxv8i64(<vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP23]], <vscale x 1 x i64> [[TMP24]], <vscale x 1 x i64> [[TMP25]], <vscale x 1 x i64> [[TMP26]], <vscale x 1 x i64> [[TMP27]])
; CHECK-NEXT: store <vscale x 8 x i64> [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8
; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]]
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
@@ -2019,27 +2031,15 @@ define void @load_store_factor8(ptr %p) {
; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3
; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP5]])
-; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP6]])
-; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
-; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP7]])
-; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP8]])
-; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP9]])
-; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP10]])
-; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 0
-; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 0
-; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 0
-; SCALABLE-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 0
-; SCALABLE-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 1
-; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 1
-; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 1
-; SCALABLE-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 1
+; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave8.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; SCALABLE-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 7
; SCALABLE-NEXT: [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 1)
; SCALABLE-NEXT: [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 2)
; SCALABLE-NEXT: [[TMP22:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 3)
@@ -2048,13 +2048,7 @@ define void @load_store_factor8(ptr %p) {
; SCALABLE-NEXT: [[TMP25:%.*]] = add <vscale x 1 x i64> [[TMP17]], splat (i64 6)
; SCALABLE-NEXT: [[TMP26:%.*]] = add <vscale x 1 x i64> [[TMP18]], splat (i64 7)
; SCALABLE-NEXT: [[TMP27:%.*]] = add <vscale x 1 x i64> [[TMP19]], splat (i64 8)
-; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP24]])
-; SCALABLE-NEXT: [[INTERLEAVED_VEC7:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP25]])
-; SCALABLE-NEXT: [[INTERLEAVED_VEC8:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP26]])
-; SCALABLE-NEXT: [[INTERLEAVED_VEC9:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP23]], <vscale x 1 x i64> [[TMP27]])
-; SCALABLE-NEXT: [[INTERLEAVED_VEC10:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC]], <vscale x 2 x i64> [[INTERLEAVED_VEC8]])
-; SCALABLE-NEXT: [[INTERLEAVED_VEC11:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC7]], <vscale x 2 x i64> [[INTERLEAVED_VEC9]])
-; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC10]], <vscale x 4 x i64> [[INTERLEAVED_VEC11]])
+; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave8.nxv8i64(<vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP23]], <vscale x 1 x i64> [[TMP24]], <vscale x 1 x i64> [[TMP25]], <vscale x 1 x i64> [[TMP26]], <vscale x 1 x i64> [[TMP27]])
; SCALABLE-NEXT: store <vscale x 8 x i64> [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8
; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]]
; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
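For context, the net effect of this hunk: one deinterleave8/interleave8 pair now replaces the three levels of deinterleave2/interleave2 calls previously needed for factor 8. A minimal sketch of the semantics (illustrative, not part of the patch; value names are made up): field I of the deinterleave8 result holds lanes I, I+8, I+16, ... of the wide vector, and interleave8 is its inverse.

  ; one call yields all 8 strided members of the group
  %de = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave8.nxv8i64(<vscale x 8 x i64> %wide)
  ; member 0 = lanes 0, 8, 16, ... of %wide
  %m0 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %de, 0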
>From 90a552885fe8bcae8c99286955973d8079c646a4 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 28 May 2025 23:04:29 +0100
Subject: [PATCH 2/6] Use isPowerOf2_32 in AArch64 TTI check
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8cbc30b071d6b..7a9fdd3fe220f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4575,10 +4575,11 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
if (VecTy->isScalableTy() && !ST->hasSVE())
return InstructionCost::getInvalid();
- // Currently factors 2 and 4 can be de[interleaved] with scalable vectors.
- // TODO: Add lowering for vector.[de]interleave3 intrinsics and
- // support in InterleavedAccessPass for ld3/st3
- if (VecTy->isScalableTy() && Factor != 2 && Factor != 4)
+  // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
+ // only have lowering for power-of-2 factors.
+ // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
+ // InterleavedAccessPass for ld3/st3
+ if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
return InstructionCost::getInvalid();
// Vectorization for masked interleaved accesses is only enabled for scalable
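For reference, the factor-3 form the TODO refers to looks like this in IR. The element counts and mangling below are illustrative assumptions modelled on the factor-8 shapes elsewhere in this patch, not checks taken from a test:

  ; needs AArch64 lowering (ideally to ld3) before factor 3 can be costed
  %de3 = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3.nxv6i32(<vscale x 6 x i32> %wide)

Until that lowering exists, returning an invalid cost here keeps the loop vectorizer from forming such groups for scalable VFs on AArch64.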
>From c65b866bd643ca06f02faa308628c0451f0c1ff8 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 29 May 2025 18:02:07 +0100
Subject: [PATCH 3/6] Move TempDeinterleavedValues outside of the loop
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1d4c95633a680..49dc8d5a0b463 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3463,10 +3463,10 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// iteration.
// When deinterleaving, the number of values will double until we
// have "InterleaveFactor".
+ // Deinterleave the elements within the vector
+ SmallVector<Value *> TempDeinterleavedValues(InterleaveFactor);
for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
NumVectors *= 2) {
- // Deinterleave the elements within the vector
- SmallVector<Value *> TempDeinterleavedValues(NumVectors);
for (unsigned I = 0; I < NumVectors; ++I) {
auto *DiTy = DeinterleavedValues[I]->getType();
TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
>From c28663f053b08178a619b5d0a9981ab25beab331 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 30 May 2025 12:17:55 +0100
Subject: [PATCH 4/6] Update comment
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 49dc8d5a0b463..38ac298f4b20d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3457,12 +3457,10 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
DeinterleavedValues[0] = NewLoad;
- // For the case of InterleaveFactor > 2, we will have to do recursive
- // deinterleaving, because the current available deinterleave intrinsic
- // supports only Factor of 2, otherwise it will bailout after first
- // iteration.
- // When deinterleaving, the number of values will double until we
- // have "InterleaveFactor".
+ // For InterleaveFactor > 8 we have to do recursive deinterleaving via
+ // deinterleave2, because the intrinsics only go up to Factor 8. We
+ // currently only support power-of-2 factors. When deinterleaving, the
+ // number of values will double until we have "InterleaveFactor".
// Deinterleave the elements within the vector
SmallVector<Value *> TempDeinterleavedValues(InterleaveFactor);
for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
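As a sketch of the doubling the new comment describes, here are the first two levels for factor 16 (the shapes match the @factor16 test added in patch 6/6, which exercises this recursive path): each level of deinterleave2 doubles the number of deinterleaved values, 1 -> 2 -> 4 -> 8 -> 16.

  %l1 = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide)
  %a = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %l1, 0
  %b = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %l1, 1
  %l2a = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
  %l2b = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
  ; ...two more levels until 16 <vscale x 1 x i32> values remain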
>From b901b2d70d3965e7e19d5bc0d42a926f0ca77839 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 30 May 2025 16:07:53 +0100
Subject: [PATCH 5/6] Remove extraneous breaks
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 14 --------------
1 file changed, 14 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 38ac298f4b20d..8bd8b63bc6013 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3221,25 +3221,18 @@ static Intrinsic::ID getInterleaveIntrinsicID(unsigned Factor) {
switch (Factor) {
case 2:
return Intrinsic::vector_interleave2;
- break;
case 3:
return Intrinsic::vector_interleave3;
- break;
case 4:
return Intrinsic::vector_interleave4;
- break;
case 5:
return Intrinsic::vector_interleave5;
- break;
case 6:
return Intrinsic::vector_interleave6;
- break;
case 7:
return Intrinsic::vector_interleave7;
- break;
case 8:
return Intrinsic::vector_interleave8;
- break;
default:
llvm_unreachable("Unexpected factor");
}
@@ -3249,25 +3242,18 @@ static Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor) {
switch (Factor) {
case 2:
return Intrinsic::vector_deinterleave2;
- break;
case 3:
return Intrinsic::vector_deinterleave3;
- break;
case 4:
return Intrinsic::vector_deinterleave4;
- break;
case 5:
return Intrinsic::vector_deinterleave5;
- break;
case 6:
return Intrinsic::vector_deinterleave6;
- break;
case 7:
return Intrinsic::vector_deinterleave7;
- break;
case 8:
return Intrinsic::vector_deinterleave8;
- break;
default:
llvm_unreachable("Unexpected factor");
}
>From 203d204b39ad80de14ec5be73ea9be2efd8c4bd2 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 30 May 2025 16:58:45 +0100
Subject: [PATCH 6/6] Add test for power of 2 factor > 8
---
.../scalable-interleaved-accesses.ll | 616 ++++++++++++++++++
1 file changed, 616 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/scalable-interleaved-accesses.ll
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/scalable-interleaved-accesses.ll
new file mode 100644
index 0000000000000..8a6f6bd3b1e55
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalable-interleaved-accesses.ll
@@ -0,0 +1,616 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-width=1 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -scalable-vectorization=on -force-target-instruction-cost=1 -force-target-supports-scalable-vectors -max-interleave-group-factor=16 < %s | FileCheck %s
+
+define void @factor8(ptr noalias %p, ptr noalias %q) {
+; CHECK-LABEL: define void @factor8(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP36]], 1024
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP38]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-NEXT: [[GEPP0:%.*]] = getelementptr [8 x i32], ptr [[P]], i64 [[TMP0]], i64 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[GEPP0]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave8.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP31:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP32:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP33:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP37:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC]], 3
+; CHECK-NEXT: [[TMP39:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC]], 4
+; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC]], 5
+; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC]], 6
+; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC]], 7
+; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr [8 x i32], ptr [[Q]], i64 [[TMP16]], i64 0
+; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP17]], align 4
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave8.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
+; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC2]], 0
+; CHECK-NEXT: [[TMP45:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT: [[TMP46:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC2]], 2
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC2]], 3
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC2]], 4
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC2]], 5
+; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC2]], 6
+; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC2]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = add <vscale x 1 x i32> [[TMP31]], [[TMP44]]
+; CHECK-NEXT: [[TMP24:%.*]] = add <vscale x 1 x i32> [[TMP32]], [[TMP45]]
+; CHECK-NEXT: [[TMP25:%.*]] = add <vscale x 1 x i32> [[TMP33]], [[TMP46]]
+; CHECK-NEXT: [[TMP26:%.*]] = add <vscale x 1 x i32> [[TMP37]], [[TMP18]]
+; CHECK-NEXT: [[TMP27:%.*]] = add <vscale x 1 x i32> [[TMP39]], [[TMP19]]
+; CHECK-NEXT: [[TMP28:%.*]] = add <vscale x 1 x i32> [[TMP41]], [[TMP20]]
+; CHECK-NEXT: [[TMP29:%.*]] = add <vscale x 1 x i32> [[TMP42]], [[TMP21]]
+; CHECK-NEXT: [[TMP30:%.*]] = add <vscale x 1 x i32> [[TMP43]], [[TMP22]]
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave8.nxv8i32(<vscale x 1 x i32> [[TMP23]], <vscale x 1 x i32> [[TMP24]], <vscale x 1 x i32> [[TMP25]], <vscale x 1 x i32> [[TMP26]], <vscale x 1 x i32> [[TMP27]], <vscale x 1 x i32> [[TMP28]], <vscale x 1 x i32> [[TMP29]], <vscale x 1 x i32> [[TMP30]])
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[GEPP0]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP40]]
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP35:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP8:%.*]] = getelementptr [8 x i32], ptr [[P]], i64 [[TMP35]], i64 0
+; CHECK-NEXT: [[P0:%.*]] = load i32, ptr [[GEPP8]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ0:%.*]] = getelementptr [8 x i32], ptr [[Q]], i64 [[TMP1]], i64 0
+; CHECK-NEXT: [[Q0:%.*]] = load i32, ptr [[GEPQ0]], align 4
+; CHECK-NEXT: [[Z0:%.*]] = add i32 [[P0]], [[Q0]]
+; CHECK-NEXT: store i32 [[Z0]], ptr [[GEPP8]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP1:%.*]] = getelementptr [8 x i32], ptr [[P]], i64 [[TMP2]], i64 1
+; CHECK-NEXT: [[P1:%.*]] = load i32, ptr [[GEPP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ1:%.*]] = getelementptr [8 x i32], ptr [[Q]], i64 [[TMP3]], i64 1
+; CHECK-NEXT: [[Q1:%.*]] = load i32, ptr [[GEPQ1]], align 4
+; CHECK-NEXT: [[Z1:%.*]] = add i32 [[P1]], [[Q1]]
+; CHECK-NEXT: store i32 [[Z1]], ptr [[GEPP1]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP2:%.*]] = getelementptr [8 x i32], ptr [[P]], i64 [[TMP4]], i64 2
+; CHECK-NEXT: [[P2:%.*]] = load i32, ptr [[GEPP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ2:%.*]] = getelementptr [8 x i32], ptr [[Q]], i64 [[TMP5]], i64 2
+; CHECK-NEXT: [[Q2:%.*]] = load i32, ptr [[GEPQ2]], align 4
+; CHECK-NEXT: [[Z2:%.*]] = add i32 [[P2]], [[Q2]]
+; CHECK-NEXT: store i32 [[Z2]], ptr [[GEPP2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP3:%.*]] = getelementptr [8 x i32], ptr [[P]], i64 [[TMP6]], i64 3
+; CHECK-NEXT: [[P3:%.*]] = load i32, ptr [[GEPP3]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ3:%.*]] = getelementptr [8 x i32], ptr [[Q]], i64 [[TMP7]], i64 3
+; CHECK-NEXT: [[Q3:%.*]] = load i32, ptr [[GEPQ3]], align 4
+; CHECK-NEXT: [[Z3:%.*]] = add i32 [[P3]], [[Q3]]
+; CHECK-NEXT: store i32 [[Z3]], ptr [[GEPP3]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP4:%.*]] = getelementptr [8 x i32], ptr [[P]], i64 [[TMP8]], i64 4
+; CHECK-NEXT: [[P4:%.*]] = load i32, ptr [[GEPP4]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ4:%.*]] = getelementptr [8 x i32], ptr [[Q]], i64 [[TMP9]], i64 4
+; CHECK-NEXT: [[Q4:%.*]] = load i32, ptr [[GEPQ4]], align 4
+; CHECK-NEXT: [[Z4:%.*]] = add i32 [[P4]], [[Q4]]
+; CHECK-NEXT: store i32 [[Z4]], ptr [[GEPP4]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP5:%.*]] = getelementptr [8 x i32], ptr [[P]], i64 [[TMP10]], i64 5
+; CHECK-NEXT: [[P5:%.*]] = load i32, ptr [[GEPP5]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ5:%.*]] = getelementptr [8 x i32], ptr [[Q]], i64 [[TMP11]], i64 5
+; CHECK-NEXT: [[Q5:%.*]] = load i32, ptr [[GEPQ5]], align 4
+; CHECK-NEXT: [[Z5:%.*]] = add i32 [[P5]], [[Q5]]
+; CHECK-NEXT: store i32 [[Z5]], ptr [[GEPP5]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP6:%.*]] = getelementptr [8 x i32], ptr [[P]], i64 [[TMP12]], i64 6
+; CHECK-NEXT: [[P6:%.*]] = load i32, ptr [[GEPP6]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ6:%.*]] = getelementptr [8 x i32], ptr [[Q]], i64 [[TMP13]], i64 6
+; CHECK-NEXT: [[Q6:%.*]] = load i32, ptr [[GEPQ6]], align 4
+; CHECK-NEXT: [[Z6:%.*]] = add i32 [[P6]], [[Q6]]
+; CHECK-NEXT: store i32 [[Z6]], ptr [[GEPP6]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP7:%.*]] = getelementptr [8 x i32], ptr [[P]], i64 [[TMP14]], i64 7
+; CHECK-NEXT: [[P15:%.*]] = load i32, ptr [[GEPP7]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ7:%.*]] = getelementptr [8 x i32], ptr [[Q]], i64 [[TMP15]], i64 7
+; CHECK-NEXT: [[Q15:%.*]] = load i32, ptr [[GEPQ7]], align 4
+; CHECK-NEXT: [[Z15:%.*]] = add i32 [[P15]], [[Q15]]
+; CHECK-NEXT: store i32 [[Z15]], ptr [[GEPP7]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+
+ %gepp0 = getelementptr [8 x i32], ptr %p, i32 %iv, i32 0
+ %p0 = load i32, ptr %gepp0
+ %gepq0 = getelementptr [8 x i32], ptr %q, i32 %iv, i32 0
+ %q0 = load i32, ptr %gepq0
+ %z0 = add i32 %p0, %q0
+ store i32 %z0, ptr %gepp0
+
+ %gepp1 = getelementptr [8 x i32], ptr %p, i32 %iv, i32 1
+ %p1 = load i32, ptr %gepp1
+ %gepq1 = getelementptr [8 x i32], ptr %q, i32 %iv, i32 1
+ %q1 = load i32, ptr %gepq1
+ %z1 = add i32 %p1, %q1
+ store i32 %z1, ptr %gepp1
+
+ %gepp2 = getelementptr [8 x i32], ptr %p, i32 %iv, i32 2
+ %p2 = load i32, ptr %gepp2
+ %gepq2 = getelementptr [8 x i32], ptr %q, i32 %iv, i32 2
+ %q2 = load i32, ptr %gepq2
+ %z2 = add i32 %p2, %q2
+ store i32 %z2, ptr %gepp2
+
+ %gepp3 = getelementptr [8 x i32], ptr %p, i32 %iv, i32 3
+ %p3 = load i32, ptr %gepp3
+ %gepq3 = getelementptr [8 x i32], ptr %q, i32 %iv, i32 3
+ %q3 = load i32, ptr %gepq3
+ %z3 = add i32 %p3, %q3
+ store i32 %z3, ptr %gepp3
+
+ %gepp4 = getelementptr [8 x i32], ptr %p, i32 %iv, i32 4
+ %p4 = load i32, ptr %gepp4
+ %gepq4 = getelementptr [8 x i32], ptr %q, i32 %iv, i32 4
+ %q4 = load i32, ptr %gepq4
+ %z4 = add i32 %p4, %q4
+ store i32 %z4, ptr %gepp4
+
+ %gepp5 = getelementptr [8 x i32], ptr %p, i32 %iv, i32 5
+ %p5 = load i32, ptr %gepp5
+ %gepq5 = getelementptr [8 x i32], ptr %q, i32 %iv, i32 5
+ %q5 = load i32, ptr %gepq5
+ %z5 = add i32 %p5, %q5
+ store i32 %z5, ptr %gepp5
+
+ %gepp6 = getelementptr [8 x i32], ptr %p, i32 %iv, i32 6
+ %p6 = load i32, ptr %gepp6
+ %gepq6 = getelementptr [8 x i32], ptr %q, i32 %iv, i32 6
+ %q6 = load i32, ptr %gepq6
+ %z6 = add i32 %p6, %q6
+ store i32 %z6, ptr %gepp6
+
+ %gepp7 = getelementptr [8 x i32], ptr %p, i32 %iv, i32 7
+ %p7 = load i32, ptr %gepp7
+ %gepq7 = getelementptr [8 x i32], ptr %q, i32 %iv, i32 7
+ %q7 = load i32, ptr %gepq7
+ %z7 = add i32 %p7, %q7
+ store i32 %z7, ptr %gepp7
+
+ %iv.next = add i32 %iv, 1
+ %done = icmp eq i32 %iv.next, 1024
+ br i1 %done, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @factor16(ptr noalias %p, ptr noalias %q) {
+; CHECK-LABEL: define void @factor16(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP35]], 1024
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP36]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-NEXT: [[GEPP0:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP0]], i64 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[GEPP0]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP87:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP88:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP87]])
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP88]])
+; CHECK-NEXT: [[TMP89:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 0
+; CHECK-NEXT: [[TMP90:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
+; CHECK-NEXT: [[TMP91:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 1
+; CHECK-NEXT: [[TMP92:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> [[TMP89]])
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> [[TMP90]])
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> [[TMP91]])
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> [[TMP92]])
+; CHECK-NEXT: [[TMP93:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC3]], 0
+; CHECK-NEXT: [[TMP94:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC4]], 0
+; CHECK-NEXT: [[TMP95:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC5]], 0
+; CHECK-NEXT: [[TMP96:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC6]], 0
+; CHECK-NEXT: [[TMP97:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC3]], 1
+; CHECK-NEXT: [[TMP98:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC4]], 1
+; CHECK-NEXT: [[TMP99:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC5]], 1
+; CHECK-NEXT: [[TMP100:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC6]], 1
+; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP93]])
+; CHECK-NEXT: [[STRIDED_VEC8:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP94]])
+; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP95]])
+; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP96]])
+; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP97]])
+; CHECK-NEXT: [[STRIDED_VEC12:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP98]])
+; CHECK-NEXT: [[STRIDED_VEC13:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP99]])
+; CHECK-NEXT: [[STRIDED_VEC14:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP100]])
+; CHECK-NEXT: [[TMP101:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC7]], 0
+; CHECK-NEXT: [[TMP102:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC8]], 0
+; CHECK-NEXT: [[TMP103:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC9]], 0
+; CHECK-NEXT: [[TMP104:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC10]], 0
+; CHECK-NEXT: [[TMP105:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC11]], 0
+; CHECK-NEXT: [[TMP106:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC12]], 0
+; CHECK-NEXT: [[TMP107:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC13]], 0
+; CHECK-NEXT: [[TMP108:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC14]], 0
+; CHECK-NEXT: [[TMP109:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC7]], 1
+; CHECK-NEXT: [[TMP110:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC8]], 1
+; CHECK-NEXT: [[TMP111:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC9]], 1
+; CHECK-NEXT: [[TMP112:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC10]], 1
+; CHECK-NEXT: [[TMP113:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC11]], 1
+; CHECK-NEXT: [[TMP32:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC12]], 1
+; CHECK-NEXT: [[TMP33:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC13]], 1
+; CHECK-NEXT: [[TMP34:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC14]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-NEXT: [[GEPQ0:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP1]], i64 0
+; CHECK-NEXT: [[WIDE_VEC15:%.*]] = load <vscale x 16 x i32>, ptr [[GEPQ0]], align 4
+; CHECK-NEXT: [[STRIDED_VEC16:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC15]])
+; CHECK-NEXT: [[TMP37:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC16]], 0
+; CHECK-NEXT: [[TMP38:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC16]], 1
+; CHECK-NEXT: [[STRIDED_VEC17:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP37]])
+; CHECK-NEXT: [[STRIDED_VEC18:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP38]])
+; CHECK-NEXT: [[TMP39:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC17]], 0
+; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC18]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC17]], 1
+; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC18]], 1
+; CHECK-NEXT: [[STRIDED_VEC19:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> [[TMP39]])
+; CHECK-NEXT: [[STRIDED_VEC20:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> [[TMP40]])
+; CHECK-NEXT: [[STRIDED_VEC21:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> [[TMP41]])
+; CHECK-NEXT: [[STRIDED_VEC22:%.*]] = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> [[TMP42]])
+; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC19]], 0
+; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC20]], 0
+; CHECK-NEXT: [[TMP45:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC21]], 0
+; CHECK-NEXT: [[TMP46:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC22]], 0
+; CHECK-NEXT: [[TMP47:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC19]], 1
+; CHECK-NEXT: [[TMP48:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC20]], 1
+; CHECK-NEXT: [[TMP49:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC21]], 1
+; CHECK-NEXT: [[TMP50:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[STRIDED_VEC22]], 1
+; CHECK-NEXT: [[STRIDED_VEC23:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP43]])
+; CHECK-NEXT: [[STRIDED_VEC24:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP44]])
+; CHECK-NEXT: [[STRIDED_VEC25:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP45]])
+; CHECK-NEXT: [[STRIDED_VEC26:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP46]])
+; CHECK-NEXT: [[STRIDED_VEC27:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP47]])
+; CHECK-NEXT: [[STRIDED_VEC28:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP48]])
+; CHECK-NEXT: [[STRIDED_VEC29:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP49]])
+; CHECK-NEXT: [[STRIDED_VEC30:%.*]] = call { <vscale x 1 x i32>, <vscale x 1 x i32> } @llvm.vector.deinterleave2.nxv2i32(<vscale x 2 x i32> [[TMP50]])
+; CHECK-NEXT: [[TMP51:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC23]], 0
+; CHECK-NEXT: [[TMP52:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC24]], 0
+; CHECK-NEXT: [[TMP53:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC25]], 0
+; CHECK-NEXT: [[TMP54:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC26]], 0
+; CHECK-NEXT: [[TMP55:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC27]], 0
+; CHECK-NEXT: [[TMP56:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC28]], 0
+; CHECK-NEXT: [[TMP57:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC29]], 0
+; CHECK-NEXT: [[TMP58:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC30]], 0
+; CHECK-NEXT: [[TMP59:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC23]], 1
+; CHECK-NEXT: [[TMP60:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC24]], 1
+; CHECK-NEXT: [[TMP61:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC25]], 1
+; CHECK-NEXT: [[TMP62:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC26]], 1
+; CHECK-NEXT: [[TMP63:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC27]], 1
+; CHECK-NEXT: [[TMP64:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC28]], 1
+; CHECK-NEXT: [[TMP65:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC29]], 1
+; CHECK-NEXT: [[TMP66:%.*]] = extractvalue { <vscale x 1 x i32>, <vscale x 1 x i32> } [[STRIDED_VEC30]], 1
+; CHECK-NEXT: [[TMP67:%.*]] = add <vscale x 1 x i32> [[TMP101]], [[TMP51]]
+; CHECK-NEXT: [[TMP68:%.*]] = add <vscale x 1 x i32> [[TMP102]], [[TMP52]]
+; CHECK-NEXT: [[TMP69:%.*]] = add <vscale x 1 x i32> [[TMP103]], [[TMP53]]
+; CHECK-NEXT: [[TMP70:%.*]] = add <vscale x 1 x i32> [[TMP104]], [[TMP54]]
+; CHECK-NEXT: [[TMP71:%.*]] = add <vscale x 1 x i32> [[TMP105]], [[TMP55]]
+; CHECK-NEXT: [[TMP72:%.*]] = add <vscale x 1 x i32> [[TMP106]], [[TMP56]]
+; CHECK-NEXT: [[TMP73:%.*]] = add <vscale x 1 x i32> [[TMP107]], [[TMP57]]
+; CHECK-NEXT: [[TMP74:%.*]] = add <vscale x 1 x i32> [[TMP108]], [[TMP58]]
+; CHECK-NEXT: [[TMP75:%.*]] = add <vscale x 1 x i32> [[TMP109]], [[TMP59]]
+; CHECK-NEXT: [[TMP76:%.*]] = add <vscale x 1 x i32> [[TMP110]], [[TMP60]]
+; CHECK-NEXT: [[TMP77:%.*]] = add <vscale x 1 x i32> [[TMP111]], [[TMP61]]
+; CHECK-NEXT: [[TMP78:%.*]] = add <vscale x 1 x i32> [[TMP112]], [[TMP62]]
+; CHECK-NEXT: [[TMP79:%.*]] = add <vscale x 1 x i32> [[TMP113]], [[TMP63]]
+; CHECK-NEXT: [[TMP80:%.*]] = add <vscale x 1 x i32> [[TMP32]], [[TMP64]]
+; CHECK-NEXT: [[TMP81:%.*]] = add <vscale x 1 x i32> [[TMP33]], [[TMP65]]
+; CHECK-NEXT: [[TMP82:%.*]] = add <vscale x 1 x i32> [[TMP34]], [[TMP66]]
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> [[TMP67]], <vscale x 1 x i32> [[TMP75]])
+; CHECK-NEXT: [[INTERLEAVED_VEC31:%.*]] = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> [[TMP68]], <vscale x 1 x i32> [[TMP76]])
+; CHECK-NEXT: [[INTERLEAVED_VEC32:%.*]] = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> [[TMP69]], <vscale x 1 x i32> [[TMP77]])
+; CHECK-NEXT: [[INTERLEAVED_VEC33:%.*]] = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> [[TMP70]], <vscale x 1 x i32> [[TMP78]])
+; CHECK-NEXT: [[INTERLEAVED_VEC34:%.*]] = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> [[TMP71]], <vscale x 1 x i32> [[TMP79]])
+; CHECK-NEXT: [[INTERLEAVED_VEC35:%.*]] = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> [[TMP72]], <vscale x 1 x i32> [[TMP80]])
+; CHECK-NEXT: [[INTERLEAVED_VEC36:%.*]] = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> [[TMP73]], <vscale x 1 x i32> [[TMP81]])
+; CHECK-NEXT: [[INTERLEAVED_VEC37:%.*]] = call <vscale x 2 x i32> @llvm.vector.interleave2.nxv2i32(<vscale x 1 x i32> [[TMP74]], <vscale x 1 x i32> [[TMP82]])
+; CHECK-NEXT: [[INTERLEAVED_VEC38:%.*]] = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> [[INTERLEAVED_VEC]], <vscale x 2 x i32> [[INTERLEAVED_VEC34]])
+; CHECK-NEXT: [[INTERLEAVED_VEC39:%.*]] = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> [[INTERLEAVED_VEC31]], <vscale x 2 x i32> [[INTERLEAVED_VEC35]])
+; CHECK-NEXT: [[INTERLEAVED_VEC40:%.*]] = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> [[INTERLEAVED_VEC32]], <vscale x 2 x i32> [[INTERLEAVED_VEC36]])
+; CHECK-NEXT: [[INTERLEAVED_VEC41:%.*]] = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> [[INTERLEAVED_VEC33]], <vscale x 2 x i32> [[INTERLEAVED_VEC37]])
+; CHECK-NEXT: [[INTERLEAVED_VEC42:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[INTERLEAVED_VEC38]], <vscale x 4 x i32> [[INTERLEAVED_VEC40]])
+; CHECK-NEXT: [[INTERLEAVED_VEC43:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[INTERLEAVED_VEC39]], <vscale x 4 x i32> [[INTERLEAVED_VEC41]])
+; CHECK-NEXT: [[INTERLEAVED_VEC44:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC42]], <vscale x 8 x i32> [[INTERLEAVED_VEC43]])
+; CHECK-NEXT: store <vscale x 16 x i32> [[INTERLEAVED_VEC44]], ptr [[GEPP0]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP86]]
+; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP83]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[TMP84:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP16:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP84]], i64 0
+; CHECK-NEXT: [[P0:%.*]] = load i32, ptr [[GEPP16]], align 4
+; CHECK-NEXT: [[TMP85:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ16:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP85]], i64 0
+; CHECK-NEXT: [[Q0:%.*]] = load i32, ptr [[GEPQ16]], align 4
+; CHECK-NEXT: [[Z0:%.*]] = add i32 [[P0]], [[Q0]]
+; CHECK-NEXT: store i32 [[Z0]], ptr [[GEPP16]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP1:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP2]], i64 1
+; CHECK-NEXT: [[P1:%.*]] = load i32, ptr [[GEPP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ1:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP3]], i64 1
+; CHECK-NEXT: [[Q1:%.*]] = load i32, ptr [[GEPQ1]], align 4
+; CHECK-NEXT: [[Z1:%.*]] = add i32 [[P1]], [[Q1]]
+; CHECK-NEXT: store i32 [[Z1]], ptr [[GEPP1]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP2:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP4]], i64 2
+; CHECK-NEXT: [[P2:%.*]] = load i32, ptr [[GEPP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ2:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP5]], i64 2
+; CHECK-NEXT: [[Q2:%.*]] = load i32, ptr [[GEPQ2]], align 4
+; CHECK-NEXT: [[Z2:%.*]] = add i32 [[P2]], [[Q2]]
+; CHECK-NEXT: store i32 [[Z2]], ptr [[GEPP2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP3:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP6]], i64 3
+; CHECK-NEXT: [[P3:%.*]] = load i32, ptr [[GEPP3]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ3:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP7]], i64 3
+; CHECK-NEXT: [[Q3:%.*]] = load i32, ptr [[GEPQ3]], align 4
+; CHECK-NEXT: [[Z3:%.*]] = add i32 [[P3]], [[Q3]]
+; CHECK-NEXT: store i32 [[Z3]], ptr [[GEPP3]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP4:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP8]], i64 4
+; CHECK-NEXT: [[P4:%.*]] = load i32, ptr [[GEPP4]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ4:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP9]], i64 4
+; CHECK-NEXT: [[Q4:%.*]] = load i32, ptr [[GEPQ4]], align 4
+; CHECK-NEXT: [[Z4:%.*]] = add i32 [[P4]], [[Q4]]
+; CHECK-NEXT: store i32 [[Z4]], ptr [[GEPP4]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP5:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP10]], i64 5
+; CHECK-NEXT: [[P5:%.*]] = load i32, ptr [[GEPP5]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ5:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP11]], i64 5
+; CHECK-NEXT: [[Q5:%.*]] = load i32, ptr [[GEPQ5]], align 4
+; CHECK-NEXT: [[Z5:%.*]] = add i32 [[P5]], [[Q5]]
+; CHECK-NEXT: store i32 [[Z5]], ptr [[GEPP5]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP6:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP12]], i64 6
+; CHECK-NEXT: [[P6:%.*]] = load i32, ptr [[GEPP6]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ6:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP13]], i64 6
+; CHECK-NEXT: [[Q6:%.*]] = load i32, ptr [[GEPQ6]], align 4
+; CHECK-NEXT: [[Z6:%.*]] = add i32 [[P6]], [[Q6]]
+; CHECK-NEXT: store i32 [[Z6]], ptr [[GEPP6]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP7:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP14]], i64 7
+; CHECK-NEXT: [[P7:%.*]] = load i32, ptr [[GEPP7]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ7:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP15]], i64 7
+; CHECK-NEXT: [[Q7:%.*]] = load i32, ptr [[GEPQ7]], align 4
+; CHECK-NEXT: [[Z7:%.*]] = add i32 [[P7]], [[Q7]]
+; CHECK-NEXT: store i32 [[Z7]], ptr [[GEPP7]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP8:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP16]], i64 8
+; CHECK-NEXT: [[P8:%.*]] = load i32, ptr [[GEPP8]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ8:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP17]], i64 8
+; CHECK-NEXT: [[Q8:%.*]] = load i32, ptr [[GEPQ8]], align 4
+; CHECK-NEXT: [[Z8:%.*]] = add i32 [[P8]], [[Q8]]
+; CHECK-NEXT: store i32 [[Z8]], ptr [[GEPP8]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP9:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP18]], i64 9
+; CHECK-NEXT: [[P9:%.*]] = load i32, ptr [[GEPP9]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ9:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP19]], i64 9
+; CHECK-NEXT: [[Q9:%.*]] = load i32, ptr [[GEPQ9]], align 4
+; CHECK-NEXT: [[Z9:%.*]] = add i32 [[P9]], [[Q9]]
+; CHECK-NEXT: store i32 [[Z9]], ptr [[GEPP9]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP10:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP20]], i64 10
+; CHECK-NEXT: [[P10:%.*]] = load i32, ptr [[GEPP10]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ10:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP21]], i64 10
+; CHECK-NEXT: [[Q10:%.*]] = load i32, ptr [[GEPQ10]], align 4
+; CHECK-NEXT: [[Z10:%.*]] = add i32 [[P10]], [[Q10]]
+; CHECK-NEXT: store i32 [[Z10]], ptr [[GEPP10]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP11:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP22]], i64 11
+; CHECK-NEXT: [[P11:%.*]] = load i32, ptr [[GEPP11]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ11:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP23]], i64 11
+; CHECK-NEXT: [[Q11:%.*]] = load i32, ptr [[GEPQ11]], align 4
+; CHECK-NEXT: [[Z11:%.*]] = add i32 [[P11]], [[Q11]]
+; CHECK-NEXT: store i32 [[Z11]], ptr [[GEPP11]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP12:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP24]], i64 12
+; CHECK-NEXT: [[P12:%.*]] = load i32, ptr [[GEPP12]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ12:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP25]], i64 12
+; CHECK-NEXT: [[Q12:%.*]] = load i32, ptr [[GEPQ12]], align 4
+; CHECK-NEXT: [[Z12:%.*]] = add i32 [[P12]], [[Q12]]
+; CHECK-NEXT: store i32 [[Z12]], ptr [[GEPP12]], align 4
+; CHECK-NEXT: [[TMP26:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP13:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP26]], i64 13
+; CHECK-NEXT: [[P13:%.*]] = load i32, ptr [[GEPP13]], align 4
+; CHECK-NEXT: [[TMP27:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ13:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP27]], i64 13
+; CHECK-NEXT: [[Q13:%.*]] = load i32, ptr [[GEPQ13]], align 4
+; CHECK-NEXT: [[Z13:%.*]] = add i32 [[P13]], [[Q13]]
+; CHECK-NEXT: store i32 [[Z13]], ptr [[GEPP13]], align 4
+; CHECK-NEXT: [[TMP28:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP14:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP28]], i64 14
+; CHECK-NEXT: [[P14:%.*]] = load i32, ptr [[GEPP14]], align 4
+; CHECK-NEXT: [[TMP29:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ14:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP29]], i64 14
+; CHECK-NEXT: [[Q14:%.*]] = load i32, ptr [[GEPQ14]], align 4
+; CHECK-NEXT: [[Z14:%.*]] = add i32 [[P14]], [[Q14]]
+; CHECK-NEXT: store i32 [[Z14]], ptr [[GEPP14]], align 4
+; CHECK-NEXT: [[TMP30:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPP15:%.*]] = getelementptr [16 x i32], ptr [[P]], i64 [[TMP30]], i64 15
+; CHECK-NEXT: [[P15:%.*]] = load i32, ptr [[GEPP15]], align 4
+; CHECK-NEXT: [[TMP31:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEPQ15:%.*]] = getelementptr [16 x i32], ptr [[Q]], i64 [[TMP31]], i64 15
+; CHECK-NEXT: [[Q15:%.*]] = load i32, ptr [[GEPQ15]], align 4
+; CHECK-NEXT: [[Z15:%.*]] = add i32 [[P15]], [[Q15]]
+; CHECK-NEXT: store i32 [[Z15]], ptr [[GEPP15]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+
+ %gepp0 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 0
+ %p0 = load i32, ptr %gepp0
+ %gepq0 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 0
+ %q0 = load i32, ptr %gepq0
+ %z0 = add i32 %p0, %q0
+ store i32 %z0, ptr %gepp0
+
+ %gepp1 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 1
+ %p1 = load i32, ptr %gepp1
+ %gepq1 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 1
+ %q1 = load i32, ptr %gepq1
+ %z1 = add i32 %p1, %q1
+ store i32 %z1, ptr %gepp1
+
+ %gepp2 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 2
+ %p2 = load i32, ptr %gepp2
+ %gepq2 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 2
+ %q2 = load i32, ptr %gepq2
+ %z2 = add i32 %p2, %q2
+ store i32 %z2, ptr %gepp2
+
+ %gepp3 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 3
+ %p3 = load i32, ptr %gepp3
+ %gepq3 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 3
+ %q3 = load i32, ptr %gepq3
+ %z3 = add i32 %p3, %q3
+ store i32 %z3, ptr %gepp3
+
+ %gepp4 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 4
+ %p4 = load i32, ptr %gepp4
+ %gepq4 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 4
+ %q4 = load i32, ptr %gepq4
+ %z4 = add i32 %p4, %q4
+ store i32 %z4, ptr %gepp4
+
+ %gepp5 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 5
+ %p5 = load i32, ptr %gepp5
+ %gepq5 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 5
+ %q5 = load i32, ptr %gepq5
+ %z5 = add i32 %p5, %q5
+ store i32 %z5, ptr %gepp5
+
+ %gepp6 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 6
+ %p6 = load i32, ptr %gepp6
+ %gepq6 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 6
+ %q6 = load i32, ptr %gepq6
+ %z6 = add i32 %p6, %q6
+ store i32 %z6, ptr %gepp6
+
+ %gepp7 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 7
+ %p7 = load i32, ptr %gepp7
+ %gepq7 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 7
+ %q7 = load i32, ptr %gepq7
+ %z7 = add i32 %p7, %q7
+ store i32 %z7, ptr %gepp7
+
+ %gepp8 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 8
+ %p8 = load i32, ptr %gepp8
+ %gepq8 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 8
+ %q8 = load i32, ptr %gepq8
+ %z8 = add i32 %p8, %q8
+ store i32 %z8, ptr %gepp8
+
+ %gepp9 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 9
+ %p9 = load i32, ptr %gepp9
+ %gepq9 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 9
+ %q9 = load i32, ptr %gepq9
+ %z9 = add i32 %p9, %q9
+ store i32 %z9, ptr %gepp9
+
+ %gepp10 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 10
+ %p10 = load i32, ptr %gepp10
+ %gepq10 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 10
+ %q10 = load i32, ptr %gepq10
+ %z10 = add i32 %p10, %q10
+ store i32 %z10, ptr %gepp10
+
+ %gepp11 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 11
+ %p11 = load i32, ptr %gepp11
+ %gepq11 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 11
+ %q11 = load i32, ptr %gepq11
+ %z11 = add i32 %p11, %q11
+ store i32 %z11, ptr %gepp11
+
+ %gepp12 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 12
+ %p12 = load i32, ptr %gepp12
+ %gepq12 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 12
+ %q12 = load i32, ptr %gepq12
+ %z12 = add i32 %p12, %q12
+ store i32 %z12, ptr %gepp12
+
+ %gepp13 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 13
+ %p13 = load i32, ptr %gepp13
+ %gepq13 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 13
+ %q13 = load i32, ptr %gepq13
+ %z13 = add i32 %p13, %q13
+ store i32 %z13, ptr %gepp13
+
+ %gepp14 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 14
+ %p14 = load i32, ptr %gepp14
+ %gepq14 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 14
+ %q14 = load i32, ptr %gepq14
+ %z14 = add i32 %p14, %q14
+ store i32 %z14, ptr %gepp14
+
+ %gepp15 = getelementptr [16 x i32], ptr %p, i32 %iv, i32 15
+ %p15 = load i32, ptr %gepp15
+ %gepq15 = getelementptr [16 x i32], ptr %q, i32 %iv, i32 15
+ %q15 = load i32, ptr %gepq15
+ %z15 = add i32 %p15, %q15
+ store i32 %z15, ptr %gepp15
+
+ %iv.next = add i32 %iv, 1
+ %done = icmp eq i32 %iv.next, 1024
+ br i1 %done, label %exit, label %loop
+
+exit:
+ ret void
+}