[llvm-branch-commits] [llvm] release/20.x: Reland "[LV]: Teach LV to recursively (de)interleave." (#125094) (PR #128389)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Feb 22 19:07:13 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-vectorizers
Author: None (llvmbot)
Backport e9a20f77ee2117b4a6eb40826b7280e29ad29e1e
Requested by: @hassnaaHamdi
---
Patch is 194.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128389.diff
6 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+7-7)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+56-23)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll (+259-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll (+252)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll (+678-640)
- (added) llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll (+135)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0ceeec48487f6..2ac58feaf39bb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3390,10 +3390,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
if (hasIrregularType(ScalarTy, DL))
return false;
- // We currently only know how to emit interleave/deinterleave with
- // Factor=2 for scalable vectors. This is purely an implementation
- // limit.
- if (VF.isScalable() && InterleaveFactor != 2)
+ // For scalable vectors, the only interleave factor currently supported
+ // must be power of 2 since we require the (de)interleave2 intrinsics
+ // instead of shufflevectors.
+ if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
return false;
// If the group involves a non-integral pointer, we may not be able to
@@ -9259,9 +9259,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
CM.getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
// For scalable vectors, the only interleave factor currently supported
- // is 2 since we require the (de)interleave2 intrinsics instead of
- // shufflevectors.
- assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+ // must be power of 2 since we require the (de)interleave2 intrinsics
+ // instead of shufflevectors.
+ assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
"Unsupported interleave factor for scalable vectors");
return Result;
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 97be2da24fc37..4f13116382796 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2892,10 +2892,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
- VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
- return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
- Vals,
- /*FMFSource=*/nullptr, Name);
+ assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
+ "scalable vectors, must be power of 2");
+ SmallVector<Value *> InterleavingValues(Vals);
+ // When interleaving, the number of values will be shrunk until we have the
+ // single final interleaved value.
+ auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType());
+ for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
+ InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
+ for (unsigned I = 0; I < Midpoint; ++I)
+ InterleavingValues[I] = Builder.CreateIntrinsic(
+ InterleaveTy, Intrinsic::vector_interleave2,
+ {InterleavingValues[I], InterleavingValues[Midpoint + I]},
+ /*FMFSource=*/nullptr, Name);
+ }
+ return InterleavingValues[0];
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2981,15 +2992,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
&InterleaveFactor](Value *MaskForGaps) -> Value * {
if (State.VF.isScalable()) {
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
- assert(InterleaveFactor == 2 &&
+ assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
auto *ResBlockInMask = State.get(BlockInMask);
- SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
- auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
- State.VF.getKnownMinValue() * 2, true);
- return State.Builder.CreateIntrinsic(
- MaskTy, Intrinsic::vector_interleave2, Ops,
- /*FMFSource=*/nullptr, "interleaved.mask");
+ SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
+ return interleaveVectors(State.Builder, Ops, "interleaved.mask");
}
if (!BlockInMask)
@@ -3029,22 +3036,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
ArrayRef<VPValue *> VPDefs = definedValues();
const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
if (VecTy->isScalableTy()) {
- assert(InterleaveFactor == 2 &&
+ assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
- // Scalable vectors cannot use arbitrary shufflevectors (only splats),
- // so must use intrinsics to deinterleave.
- Value *DI = State.Builder.CreateIntrinsic(
- Intrinsic::vector_deinterleave2, VecTy, NewLoad,
- /*FMFSource=*/nullptr, "strided.vec");
- unsigned J = 0;
- for (unsigned I = 0; I < InterleaveFactor; ++I) {
- Instruction *Member = Group->getMember(I);
+ // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+ // so must use intrinsics to deinterleave.
+ SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
+ DeinterleavedValues[0] = NewLoad;
+ // For the case of InterleaveFactor > 2, we will have to do recursive
+ // deinterleaving, because the current available deinterleave intrinsic
+ // supports only Factor of 2, otherwise it will bailout after first
+ // iteration.
+ // When deinterleaving, the number of values will double until we
+ // have "InterleaveFactor".
+ for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
+ NumVectors *= 2) {
+ // Deinterleave the elements within the vector
+ SmallVector<Value *> TempDeinterleavedValues(NumVectors);
+ for (unsigned I = 0; I < NumVectors; ++I) {
+ auto *DiTy = DeinterleavedValues[I]->getType();
+ TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
+ Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
+ /*FMFSource=*/nullptr, "strided.vec");
+ }
+ // Extract the deinterleaved values:
+ for (unsigned I = 0; I < 2; ++I)
+ for (unsigned J = 0; J < NumVectors; ++J)
+ DeinterleavedValues[NumVectors * I + J] =
+ State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
+ }
- if (!Member)
+#ifndef NDEBUG
+ for (Value *Val : DeinterleavedValues)
+ assert(Val && "NULL Deinterleaved Value");
+#endif
+ for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+ Value *StridedVec = DeinterleavedValues[I];
+ if (!Member) {
+ // This value is not needed as it's not used
+ cast<Instruction>(StridedVec)->eraseFromParent();
continue;
-
- Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
+ }
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index bf95622733461..05c0bc0761ea4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
@@ -1548,5 +1548,263 @@ end:
ret void
}
+; Check vectorization on an interleaved load/store groups of factor 4
+
+; for (int i = 0; i < 1024; ++i) {
+; dst[i].x = a[i].x + b[i].x;
+; dst[i].y = a[i].y - b[i].y;
+; dst[i].z = a[i].z << b[i].z;
+; dst[i].t = a[i].t >> b[i].t;
+; }
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) {
+; CHECK-LABEL: @interleave_deinterleave(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP7]])
+; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP8]])
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <vscale x 16 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 1
+; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP14]])
+; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP15]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 0
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 1
+; CHECK-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
+; CHECK-NEXT: [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
+; CHECK-NEXT: [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP23]])
+; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP24]])
+; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC12]])
+; CHECK-NEXT: store <vscale x 16 x i32> [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]]
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4
+; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4
+; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4
+; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8
+; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4
+; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8
+; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4
+; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12
+; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4
+; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12
+; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4
+; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12
+; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv
+ store i32 %add, ptr %arrayidx5, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4
+ %2 = load i32, ptr %y, align 4
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4
+ %3 = load i32, ptr %y11, align 4
+ %sub = sub nsw i32 %2, %3
+ %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4
+ store i32 %sub, ptr %y14, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8
+ %4 = load i32, ptr %z, align 4
+ %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8
+ %5 = load i32, ptr %z19, align 4
+ %shl = shl i32 %4, %5
+ %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8
+ store i32 %shl, ptr %z22, align 4
+ %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12
+ %6 = load i32, ptr %t, align 4
+ %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12
+ %7 = load i32, ptr %t27, align 4
+ %shr = ashr i32 %6, %7
+ %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12
+ store i32 %shr, ptr %t30, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; Check vectorization on a reverse interleaved load/store groups of factor 4
+
+; for (int i = 1023; i >= 0; i--) {
+; int a = A[i].x + i;
+; int b = A[i].y - i;
+; int c = A[i].z * i;
+; int d = A[i].t << i;
+; B[i].x = a;
+; B[i].y = b;
+; B[i].z = c;
+; B[i].t = d;
+; }
+
+define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{
+; CHECK-LABEL: @interleave_deinterleave_reverse(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT: [[INDUCTION:%.*]] = sub <vscale x 4 x i32> splat (i32 1023), [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP4]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; CHECK-NE...
[truncated]
``````````
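For readers skimming the patch without the surrounding source, here is a minimal standalone sketch of the lane permutation the new code builds out of nested `llvm.vector.interleave2` / `llvm.vector.deinterleave2` calls. It uses plain `std::vector<int>` in place of scalable vectors; the helper names (`interleave2`, `deinterleave2`, `interleaveRecursive`, `deinterleaveRecursive`) and the factor-4 example data are illustrative assumptions, not LLVM APIs.

```cpp
// Standalone illustration (not LLVM code): models the recursive pairwise
// (de)interleaving the patch emits, using plain vectors of ints in place of
// scalable vectors.
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

using Vec = std::vector<int>;

// interleave2([a0,a1,...],[b0,b1,...]) -> [a0,b0,a1,b1,...]
static Vec interleave2(const Vec &A, const Vec &B) {
  Vec R;
  for (size_t I = 0; I < A.size(); ++I) {
    R.push_back(A[I]);
    R.push_back(B[I]);
  }
  return R;
}

// deinterleave2([a0,b0,a1,b1,...]) -> {[a0,a1,...],[b0,b1,...]}
static std::pair<Vec, Vec> deinterleave2(const Vec &V) {
  Vec Evens, Odds;
  for (size_t I = 0; I < V.size(); I += 2) {
    Evens.push_back(V[I]);
    Odds.push_back(V[I + 1]);
  }
  return {Evens, Odds};
}

// Store path: repeatedly pair value I with value Midpoint+I until a single
// wide value remains (the halving loop in interleaveVectors).
static Vec interleaveRecursive(std::vector<Vec> Vals) {
  for (size_t Midpoint = Vals.size() / 2; Midpoint > 0; Midpoint /= 2)
    for (size_t I = 0; I < Midpoint; ++I)
      Vals[I] = interleave2(Vals[I], Vals[Midpoint + I]);
  return Vals[0];
}

// Load path: split the wide value in halves, doubling the number of vectors
// each round, until Factor member vectors are recovered.
static std::vector<Vec> deinterleaveRecursive(const Vec &Wide, size_t Factor) {
  std::vector<Vec> Vals(Factor);
  Vals[0] = Wide;
  for (size_t NumVectors = 1; NumVectors < Factor; NumVectors *= 2) {
    std::vector<std::pair<Vec, Vec>> Tmp(NumVectors);
    for (size_t J = 0; J < NumVectors; ++J)
      Tmp[J] = deinterleave2(Vals[J]);
    for (size_t I = 0; I < 2; ++I)
      for (size_t J = 0; J < NumVectors; ++J)
        Vals[NumVectors * I + J] = (I == 0 ? Tmp[J].first : Tmp[J].second);
  }
  return Vals;
}

int main() {
  // Four "members" of an interleave group (e.g. the x/y/z/t fields above).
  std::vector<Vec> Members = {{0, 4, 8}, {1, 5, 9}, {2, 6, 10}, {3, 7, 11}};
  Vec Wide = interleaveRecursive(Members);
  assert((Wide == Vec{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}));
  assert(deinterleaveRecursive(Wide, 4) == Members);
  return 0;
}
```

The same halving/doubling structure is what produces the nested `deinterleave2.nxv16i32` followed by two `deinterleave2.nxv8i32` calls, and the mirrored `interleave2` chain, visible in the AArch64 factor-4 test checks quoted above.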
https://github.com/llvm/llvm-project/pull/128389
More information about the llvm-branch-commits mailing list