[llvm] [VPlan] Allow recursive narrowing in interleave group narrowing. (PR #167310)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 10 04:55:37 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-vectorizers
@llvm/pr-subscribers-llvm-transforms
Author: Florian Hahn (fhahn)
Changes:
This allows canNarrowOps to recursively check whether operands can be
narrowed, so longer chains of operations feeding interleave groups can
now be narrowed as well.
Depends on https://github.com/llvm/llvm-project/pull/167309 (included in the PR)
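For intuition, here is a minimal standalone sketch of the recursive shape of the check, using a toy expression-node model; `Node`, `NarrowableLeaf`, and the opcode values are illustrative stand-ins, not VPlan API. Each operand position across the stored values forms a "column" that must either recurse into another matching operation or bottom out in individually narrowable leaves:

```cpp
#include <cstddef>
#include <vector>

// Toy stand-in for a VPlan recipe; Opcode 0 marks a leaf (a load member or
// uniform value in the real transform).
struct Node {
  unsigned Opcode = 0;
  std::vector<Node *> Ops;
  bool NarrowableLeaf = false; // stand-in for the canNarrowLoad check
};

// All values must share one opcode; each operand "column" must narrow
// recursively or consist of individually narrowable leaves.
static bool canNarrowOps(const std::vector<Node *> &Vals) {
  if (Vals.empty())
    return false;
  Node *First = Vals.front();
  if (!First || First->Opcode == 0)
    return false;
  for (std::size_t Idx = 0; Idx != First->Ops.size(); ++Idx) {
    std::vector<Node *> Column;
    for (Node *V : Vals) {
      if (!V || V->Opcode != First->Opcode || V->Ops.size() > 2)
        return false;
      Column.push_back(V->Ops[Idx]);
    }
    if (canNarrowOps(Column)) // the new recursive step
      continue;
    for (Node *Op : Column) // fallback: one-level leaf check
      if (!Op || !Op->NarrowableLeaf)
        return false;
  }
  return true;
}

int main() {
  // Both interleave members are mul(factor, load) then add(..., splat),
  // modelling the mul+add chain from the tests.
  Node F{0, {}, true}, L0{0, {}, true}, L1{0, {}, true}, C{0, {}, true};
  Node Mul0{1, {&F, &L0}}, Mul1{1, {&F, &L1}};
  Node Add0{2, {&Mul0, &C}}, Add1{2, {&Mul1, &C}};
  std::vector<Node *> Stored{&Add0, &Add1};
  return canNarrowOps(Stored) ? 0 : 1; // expect exit code 0
}
```

Compiled and run as-is, this exits 0 for the narrowable mul+add chain; mismatching an opcode or marking a leaf non-narrowable flips the result, mirroring the early `return false` paths in the real canNarrowOps.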
---
Full diff: https://github.com/llvm/llvm-project/pull/167310.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+30-15)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll (+29-53)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 634df51a12965..d6603231f9205 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4142,6 +4142,35 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
return false;
}
+static bool canNarrowOps(ArrayRef<VPValue *> Ops) {
+ auto *WideMember0 = dyn_cast<VPWidenRecipe>(Ops[0]);
+ if (!WideMember0)
+ return false;
+
+ for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
+ SmallVector<VPValue *> Ops0;
+ for (VPValue *V : Ops) {
+ auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
+ if (!R || R->getOpcode() != WideMember0->getOpcode() ||
+ R->getNumOperands() > 2)
+ return false;
+
+ Ops0.push_back(R->getOperand(Idx));
+ }
+ if (canNarrowOps(Ops0))
+ continue;
+ if (any_of(enumerate(Ops0), [WideMember0, Idx](const auto &P) {
+ const auto &[OpIdx, OpV] = P;
+ return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx);
+ }))
+ return false;
+ }
+
+ return true;
+}
+
/// Returns true if \p IR is a full interleave group with factor and number of
/// members both equal to \p VF. The interleave group must also access the full
/// vector width \p VectorRegWidth.
@@ -4313,22 +4342,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// Check if all values feeding InterleaveR are matching wide recipes, with
// operands that can be narrowed.
- auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
- InterleaveR->getStoredValues()[0]->getDefiningRecipe());
- if (!WideMember0)
+ if (!canNarrowOps(InterleaveR->getStoredValues()))
return;
- for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
- auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
- if (!R || R->getOpcode() != WideMember0->getOpcode() ||
- R->getNumOperands() > 2)
- return;
- if (any_of(enumerate(R->operands()),
- [WideMember0, Idx = I](const auto &P) {
- const auto &[OpIdx, OpV] = P;
- return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
- }))
- return;
- }
StoreGroups.push_back(InterleaveR);
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll
index 23bc21a49a8b3..75e363a32fd24 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll
@@ -14,20 +14,16 @@ define void @test_2xi64_mul_add(ptr noalias %data, ptr noalias %factor) {
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP0]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2)
+; VF2-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
; VF2-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], splat (i64 2)
-; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP2]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -188,20 +184,16 @@ define void @test_2xi64_mul_sub(ptr noalias %data, ptr noalias %factor) {
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP0]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; VF2-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP3]], splat (i64 2)
+; VF2-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP5]], splat (i64 2)
-; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP2]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -304,20 +296,15 @@ define void @test_2xi64_mul_sub_mismatched_ops2(ptr noalias %data, ptr noalias %
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], splat (i64 3)
; VF2-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP3]], splat (i64 2)
-; VF2-NEXT: [[TMP5:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; VF2-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP5]], splat (i64 2)
-; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP4]], ptr [[TMP2]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -420,22 +407,17 @@ define void @test_2xi64_mul_add_xor(ptr noalias %data, ptr noalias %factor) {
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP0]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2)
-; VF2-NEXT: [[TMP5:%.*]] = xor <2 x i64> splat (i64 4), [[TMP4]]
+; VF2-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
; VF2-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
; VF2-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], splat (i64 2)
; VF2-NEXT: [[TMP8:%.*]] = xor <2 x i64> splat (i64 4), [[TMP7]]
-; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP8]], ptr [[TMP2]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -606,22 +588,16 @@ define void @test_2xi64_mul_add_xor_mismatched_ops(ptr noalias %data, ptr noalia
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8
+; VF2-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP0]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
; VF2-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[WIDE_LOAD]], splat (i64 3)
-; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2)
-; VF2-NEXT: [[TMP5:%.*]] = xor <2 x i64> splat (i64 4), [[TMP4]]
-; VF2-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; VF2-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], splat (i64 2)
+; VF2-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP3]], splat (i64 2)
; VF2-NEXT: [[TMP8:%.*]] = xor <2 x i64> splat (i64 4), [[TMP7]]
-; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP8]], ptr [[TMP2]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
``````````
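For reference, the chained pattern the updated tests exercise corresponds roughly to the scalar loop below (illustrative C++ reconstructed from the mul/add/xor check lines above, not copied from the test file):

```cpp
#include <cstdint>

// Roughly the scalar form of test_2xi64_mul_add_xor: both members of the
// 2-wide interleave group at data[2*i] and data[2*i+1] are computed by the
// same mul -> add -> xor chain, the multi-recipe case the recursive check
// now handles.
void mul_add_xor(std::int64_t *data, const std::int64_t *factor,
                 std::int64_t n) {
  for (std::int64_t i = 0; i < n; ++i) {
    std::int64_t f = factor[i];
    data[2 * i] = 4 ^ (f * data[2 * i] + 2);
    data[2 * i + 1] = 4 ^ (f * data[2 * i + 1] + 2);
  }
}
```

As the updated VF2 checks show, the interleaved `<4 x i64>` load/store and the duplicated op chain disappear after narrowing: the vector body performs a single `<2 x i64>` load and store, and the induction increment drops from 2 to 1.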
https://github.com/llvm/llvm-project/pull/167310