[llvm] Reland "[LV]: Teach LV to recursively (de)interleave." (PR #125094)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 30 09:17:59 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-vectorizers
Author: Hassnaa Hamdi (hassnaaHamdi)
Changes:
This patch relands the changes from "[LV]: Teach LV to recursively (de)interleave." (#122989)
Reason for revert:
- The patch exposed an assert in the vectorizer, caused by a VF mismatch between the
legacy cost model and the VPlan-based cost model: no cost was computed for the
VPInstruction that VPlanTransforms creates as a replacement for an 'or disjoint'
instruction.
- The fix skips the comparison of the legacy model against the VPlan-based model,
at least for interleave groups with factor > 2.
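
For context, the relanded change lowers factor-N interleave groups (N a power of 2) on scalable vectors as a tree of factor-2 intrinsics. Below is a minimal standalone C++ sketch of the idea; `interleave2` here is a hypothetical plain-vector stand-in for the `llvm.vector.interleave2` intrinsic, and the loop mirrors the midpoint pairing in `interleaveVectors` from the diff below:

```cpp
// Hedged sketch: simulates the recursive interleave on plain std::vectors.
#include <cassert>
#include <cstddef>
#include <vector>

// Models llvm.vector.interleave2: alternate elements of A and B.
static std::vector<int> interleave2(const std::vector<int> &A,
                                    const std::vector<int> &B) {
  std::vector<int> R;
  for (std::size_t I = 0; I < A.size(); ++I) {
    R.push_back(A[I]);
    R.push_back(B[I]);
  }
  return R;
}

int main() {
  const unsigned Factor = 4, VF = 2; // power-of-2 factor, 2 lanes per member
  // Member J carries the values J*10 + lane.
  std::vector<std::vector<int>> Vals(Factor);
  for (unsigned J = 0; J < Factor; ++J)
    for (unsigned L = 0; L < VF; ++L)
      Vals[J].push_back(int(J * 10 + L));
  // The midpoint pairing loop, as in interleaveVectors in the diff.
  for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2)
    for (unsigned I = 0; I < Midpoint; ++I)
      Vals[I] = interleave2(Vals[I], Vals[Midpoint + I]);
  // Result is round-robin: one element from each member per lane.
  assert((Vals[0] == std::vector<int>{0, 10, 20, 30, 1, 11, 21, 31}));
}
```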
---
Patch is 207.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125094.diff
7 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+13-7)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+56-23)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses-cost.ll (+108-11)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll (+259-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll (+252)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll (+678-640)
- (added) llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll (+135)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5375d2be9c8751..1b05e09a95ab6d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3400,10 +3400,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
if (hasIrregularType(ScalarTy, DL))
return false;
- // We currently only know how to emit interleave/deinterleave with
- // Factor=2 for scalable vectors. This is purely an implementation
- // limit.
- if (VF.isScalable() && InterleaveFactor != 2)
+  // For scalable vectors, the only interleave factors currently supported
+  // are powers of 2, since we require the (de)interleave2 intrinsics
+  // instead of shufflevectors.
+ if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
return false;
// If the group involves a non-integral pointer, we may not be able to
@@ -7434,6 +7434,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
for (VPRecipeBase &R : *VPBB) {
if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
auto *IG = IR->getInterleaveGroup();
+        // The legacy cost model is more accurate for interleaving, so
+        // comparing the VPlan-based cost against it isn't desirable. At
+        // least skip the comparison for interleave groups with factor > 2,
+        // as higher factors cause larger cost differences.
+ if (IG->getFactor() > 2)
+ return true;
unsigned NumMembers = IG->getNumMembers();
for (unsigned I = 0; I != NumMembers; ++I) {
if (Instruction *M = IG->getMember(I))
@@ -9279,9 +9285,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
CM.getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
// For scalable vectors, the only interleave factor currently supported
- // is 2 since we require the (de)interleave2 intrinsics instead of
- // shufflevectors.
- assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+    // must be a power of 2, since we require the (de)interleave2
+    // intrinsics instead of shufflevectors.
+ assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
"Unsupported interleave factor for scalable vectors");
return Result;
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2679ed6b26b5d1..7b2fab146945bf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2868,10 +2868,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
- VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
- return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
- Vals,
- /*FMFSource=*/nullptr, Name);
+ assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
+ "scalable vectors, must be power of 2");
+ SmallVector<Value *> InterleavingValues(Vals);
+    // When interleaving, the number of values is halved each round until a
+    // single final interleaved value remains.
+ auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType());
+ for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
+ InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
+ for (unsigned I = 0; I < Midpoint; ++I)
+ InterleavingValues[I] = Builder.CreateIntrinsic(
+ InterleaveTy, Intrinsic::vector_interleave2,
+ {InterleavingValues[I], InterleavingValues[Midpoint + I]},
+ /*FMFSource=*/nullptr, Name);
+ }
+ return InterleavingValues[0];
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2957,15 +2968,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
&InterleaveFactor](Value *MaskForGaps) -> Value * {
if (State.VF.isScalable()) {
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
- assert(InterleaveFactor == 2 &&
+ assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
auto *ResBlockInMask = State.get(BlockInMask);
- SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
- auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
- State.VF.getKnownMinValue() * 2, true);
- return State.Builder.CreateIntrinsic(
- MaskTy, Intrinsic::vector_interleave2, Ops,
- /*FMFSource=*/nullptr, "interleaved.mask");
+ SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
+ return interleaveVectors(State.Builder, Ops, "interleaved.mask");
}
if (!BlockInMask)
@@ -3005,22 +3012,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
ArrayRef<VPValue *> VPDefs = definedValues();
const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
if (VecTy->isScalableTy()) {
- assert(InterleaveFactor == 2 &&
+ assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
- // Scalable vectors cannot use arbitrary shufflevectors (only splats),
- // so must use intrinsics to deinterleave.
- Value *DI = State.Builder.CreateIntrinsic(
- Intrinsic::vector_deinterleave2, VecTy, NewLoad,
- /*FMFSource=*/nullptr, "strided.vec");
- unsigned J = 0;
- for (unsigned I = 0; I < InterleaveFactor; ++I) {
- Instruction *Member = Group->getMember(I);
+ // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+ // so must use intrinsics to deinterleave.
+ SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
+ DeinterleavedValues[0] = NewLoad;
+      // For InterleaveFactor > 2 we deinterleave recursively, since the
+      // available deinterleave intrinsic supports only a factor of 2; for
+      // InterleaveFactor == 2 the loop below finishes after a single
+      // iteration. When deinterleaving, the number of values doubles each
+      // iteration until we have "InterleaveFactor" values.
+ for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
+ NumVectors *= 2) {
+ // Deinterleave the elements within the vector
+ SmallVector<Value *> TempDeinterleavedValues(NumVectors);
+ for (unsigned I = 0; I < NumVectors; ++I) {
+ auto *DiTy = DeinterleavedValues[I]->getType();
+ TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
+ Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
+ /*FMFSource=*/nullptr, "strided.vec");
+ }
+ // Extract the deinterleaved values:
+ for (unsigned I = 0; I < 2; ++I)
+ for (unsigned J = 0; J < NumVectors; ++J)
+ DeinterleavedValues[NumVectors * I + J] =
+ State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
+ }
- if (!Member)
+#ifndef NDEBUG
+ for (Value *Val : DeinterleavedValues)
+ assert(Val && "NULL Deinterleaved Value");
+#endif
+ for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+ Value *StridedVec = DeinterleavedValues[I];
+ if (!Member) {
+          // No member at this index, so the deinterleaved value is unused;
+          // erase it.
+ cast<Instruction>(StridedVec)->eraseFromParent();
continue;
-
- Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
+ }
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses-cost.ll
index 564fba65e62389..5c6415d976539f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses-cost.ll
@@ -10,32 +10,123 @@ define void @test_masked_interleave(ptr noalias %A, ptr noalias %B, ptr noalias
; CHECK-LABEL: define void @test_masked_interleave(
; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 252, [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 252, [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], splat (i64 4)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 4, [[TMP7]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP11]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; CHECK-NEXT: [[TMP13:%.*]] = mul <vscale x 16 x i32> [[TMP12]], splat (i32 4)
+; CHECK-NEXT: [[INDUCTION1:%.*]] = add <vscale x 16 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP7]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = mul i32 4, [[TMP14]]
+; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP15]], i64 0
+; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT2]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x ptr> poison, ptr [[C]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 16 x ptr> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <vscale x 16 x ptr> poison, ptr [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <vscale x 16 x ptr> [[BROADCAST_SPLATINSERT13]], <vscale x 16 x ptr> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <vscale x 16 x i32> [ [[INDUCTION1]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[IV_1:%.*]] = or disjoint i64 [[IV]], 1
; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV_1]]
-; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_A_1]], align 1
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 64 x i8>, ptr [[GEP_A_1]], align 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP19]])
+; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP20]])
+; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC6]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq <vscale x 16 x i8> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = add <vscale x 16 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[A]], <vscale x 16 x i64> [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x ptr> [[TMP24]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i32 -2
+; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP22]], <vscale x 16 x i1> [[TMP22]])
+; CHECK-NEXT: [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP22]], <vscale x 16 x i1> [[TMP22]])
+; CHECK-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP26]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]], <vscale x 64 x i8> poison)
+; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; CHECK-NEXT: [[TMP27:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC10]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC10]], 1
+; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP27]])
+; CHECK-NEXT: [[STRIDED_VEC12:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP28]])
+; CHECK-NEXT: [[TMP29:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC11]], 0
+; CHECK-NEXT: [[TMP30:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC12]], 0
+; CHECK-NEXT: [[TMP31:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC11]], 1
+; CHECK-NEXT: [[TMP32:%.*]] = zext <vscale x 16 x i8> [[TMP31]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP33:%.*]] = shl <vscale x 16 x i32> [[TMP32]], splat (i32 2)
+; CHECK-NEXT: [[TMP34:%.*]] = zext <vscale x 16 x i8> [[TMP30]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP35:%.*]] = shl <vscale x 16 x i32> [[TMP34]], splat (i32 2)
+; CHECK-NEXT: [[TMP36:%.*]] = or <vscale x 16 x i32> [[TMP35]], [[TMP33]]
+; CHECK-NEXT: [[TMP37:%.*]] = zext <vscale x 16 x i8> [[TMP29]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP38:%.*]] = or <vscale x 16 x i32> [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = shl <vscale x 16 x i32> [[TMP38]], splat (i32 2)
+; CHECK-NEXT: [[TMP40:%.*]] = or <vscale x 16 x i32> splat (i32 3), [[VEC_IND4]]
+; CHECK-NEXT: [[TMP41:%.*]] = or <vscale x 16 x i32> [[TMP39]], [[TMP40]]
+; CHECK-NEXT: [[TMP42:%.*]] = lshr <vscale x 16 x i32> [[TMP41]], splat (i32 2)
+; CHECK-NEXT: [[TMP43:%.*]] = lshr <vscale x 16 x i32> [[TMP32]], splat (i32 2)
+; CHECK-NEXT: [[TMP44:%.*]] = trunc <vscale x 16 x i32> [[TMP43]] to <vscale x 16 x i8>
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP44]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP22]])
+; CHECK-NEXT: [[TMP45:%.*]] = lshr <vscale x 16 x i32> [[TMP32]], splat (i32 5)
+; CHECK-NEXT: [[TMP46:%.*]] = trunc <vscale x 16 x i32> [[TMP45]] to <vscale x 16 x i8>
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP46]], <vscale x 16 x ptr> [[BROADCAST_SPLAT]], i32 1, <vscale x 16 x i1> [[TMP22]])
+; CHECK-NEXT: [[TMP47:%.*]] = trunc <vscale x 16 x i32> [[TMP42]] to <vscale x 16 x i8>
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP47]], <vscale x 16 x ptr> [[BROADCAST_SPLAT14]], i32 1, <vscale x 16 x i1> [[TMP22]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 16 x i32> [[VEC_IND4]], [[DOTSPLAT3]]
+; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV_3:%.*]] = or disjoint i64 [[IV1]], 1
+; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV_3]]
+; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_A_2]], align 1
; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[L_1]], 0
; CHECK-NEXT: br i1 [[C_1]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[IV_2:%.*]] = or disjoint i64 [[IV]], 2
+; CHECK-NEXT: [[IV_2:%.*]] = or disjoint i64 [[IV1]], 2
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV_2]]
; CHECK-NEXT: [[L_2:%.*]] = load i8, ptr [[ARRAYIDX7]], align 1
; CHECK-NEXT: [[CONV8:%.*]] = zext i8 [[L_2]] to i32
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[CONV8]], 2
-; CHECK-NEXT: [[ADD9:%.*]] = or disjoint i64 [[IV]], 1
+; CHECK-NEXT: [[ADD9:%.*]] = or disjoint i64 [[IV1]], 1
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr i8, ptr [[A]], i64 [[ADD9]]
; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX10]], align 1
; CHECK-NEXT: [[CONV11:%.*]] = zext i8 [[TMP0]] to i32
; CHECK-NEXT: [[SHL12:%.*]] = shl i32 [[CONV11]], 2
; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL12]], [[SHL]]
-; CHECK-NEXT: [[B2:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[B2:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV1]]
; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[B2]], align 1
; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: [[OR16:%.*]] = or i32 [[OR]], [[CONV15]]
; CHECK-NEXT: [[SHL17:%.*]] = shl i32 [[OR16]], 2
-; CHECK-NEXT: [[CONV19:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-NEXT: [[CONV19:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-NEXT: [[ADD20:%.*]] = or i32 3, [[CONV19]]
; CHECK-NEXT: [[DEST_0:%.*]] = or i32 [[SHL17]], [[ADD20]]
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[DEST_0]], 2
@@ -49,9 +140,9 @@ define void @test_masked_interleave(ptr noalias %A, ptr noalias %B, ptr noalias
; CHECK-NEXT: store i8 [[CONV34]], ptr [[B]], align 1
; CHECK-NEXT: br label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4
-; CHECK-NEXT: [[EC:%.*]] = icmp ugt i64 [[IV]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 4
+; CHECK-NEXT: [[EC:%.*]] = icmp ugt i64...
[truncated]
``````````
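
For the reverse direction, here is a matching standalone sketch, again with a hypothetical `deinterleave2` helper modelling `llvm.vector.deinterleave2`, of the doubling loop added in `VPInterleaveRecipe::execute` above, verifying that recursive deinterleaving recovers the original group members. Both sketches assume a power-of-2 factor, matching the asserts added in the patch.

```cpp
// Hedged sketch: simulates the recursive deinterleave on plain std::vectors.
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Models llvm.vector.deinterleave2: split into even/odd elements.
static std::pair<std::vector<int>, std::vector<int>>
deinterleave2(const std::vector<int> &V) {
  std::pair<std::vector<int>, std::vector<int>> R;
  for (std::size_t I = 0; I < V.size(); ++I)
    (I % 2 ? R.second : R.first).push_back(V[I]);
  return R;
}

int main() {
  const unsigned Factor = 4, VF = 2;
  // Wide load of a factor-4 group: one element from each member per lane.
  std::vector<std::vector<int>> Vals(Factor);
  Vals[0] = {0, 10, 20, 30, 1, 11, 21, 31};
  // The doubling loop, as in VPInterleaveRecipe::execute in the diff.
  for (unsigned NumVectors = 1; NumVectors < Factor; NumVectors *= 2) {
    std::vector<std::pair<std::vector<int>, std::vector<int>>> Tmp(NumVectors);
    for (unsigned I = 0; I < NumVectors; ++I)
      Tmp[I] = deinterleave2(Vals[I]);
    for (unsigned I = 0; I < 2; ++I)
      for (unsigned J = 0; J < NumVectors; ++J)
        Vals[NumVectors * I + J] = I == 0 ? Tmp[J].first : Tmp[J].second;
  }
  // Member J should get back its original values J*10 + lane.
  for (unsigned J = 0; J < Factor; ++J)
    for (unsigned L = 0; L < VF; ++L)
      assert(Vals[J][L] == int(J * 10 + L));
}
```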
https://github.com/llvm/llvm-project/pull/125094