[llvm] [VPlan] Don't narrow wide loads for scalable VFs when narrowing IGs. (PR #186181)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 13 08:31:22 PDT 2026
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/186181
>From 66e83dcbf91fa4289ae8acf64cfabb7c0c312e8d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 12 Mar 2026 16:50:19 +0000
Subject: [PATCH] [VPlan] Don't narrow wide loads for scalable VFs when
narrowing IGs.
For scalable VFs, the narrowed plan processes vscale iterations at once,
so a shared wide load cannot be narrowed to a uniform scalar; bail out,
as there currently is no way to create a narrowed load that loads
vscale elements.
Fixes https://github.com/llvm/llvm-project/issues/185860.
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 18 ++++++-----
...row-interleave-to-widen-memory-scalable.ll | 30 +++++++------------
2 files changed, 22 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 419c2f464a8ef..a44c2da216904 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5214,19 +5214,22 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
/// is defined at \p Idx of a load interleave group.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
- VPValue *OpV, unsigned Idx) {
+ VPValue *OpV, unsigned Idx, bool IsScalable) {
VPValue *Member0Op = WideMember0->getOperand(OpIdx);
VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
if (!Member0OpR)
return Member0Op == OpV;
if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
- return !W->getMask() && W->isConsecutive() && Member0Op == OpV;
+ // For scalable VFs, the narrowed plan processes vscale iterations at once,
+ // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
+ return !IsScalable && !W->getMask() && W->isConsecutive() &&
+ Member0Op == OpV;
if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
return false;
}
-static bool canNarrowOps(ArrayRef<VPValue *> Ops) {
+static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
SmallVector<VPValue *> Ops0;
auto *WideMember0 = dyn_cast<VPWidenRecipe>(Ops[0]);
if (!WideMember0)
@@ -5244,12 +5247,12 @@ static bool canNarrowOps(ArrayRef<VPValue *> Ops) {
for (VPValue *Op : Ops)
OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
- if (canNarrowOps(OpsI))
+ if (canNarrowOps(OpsI, IsScalable))
continue;
- if (any_of(enumerate(OpsI), [WideMember0, Idx](const auto &P) {
+ if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
const auto &[OpIdx, OpV] = P;
- return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx);
+ return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
}))
return false;
}
@@ -5458,7 +5461,8 @@ VPlanTransforms::narrowInterleaveGroups(VPlan &Plan,
// Check if all values feeding InterleaveR are matching wide recipes, which
// operands that can be narrowed.
- if (!canNarrowOps(InterleaveR->getStoredValues()))
+ if (!canNarrowOps(InterleaveR->getStoredValues(),
+ VFToOptimize->isScalable()))
return nullptr;
StoreGroups.push_back(InterleaveR);
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
index baa1f7ed85c1d..fff318850bef6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
@@ -409,41 +409,33 @@ exit:
}
; Shared wide load (scales) feeds all members of the store interleave group
-; through fmul.
-; FIXME: With scalable VF, we incorrectly narrow the shared load to a uniform
-; scalar which is incorrect because the loop step is vscale and we need vscale
-; distinct values.
+; through fmul. With scalable VF, we currently cannot narrow the shared load
+; because we would need to load vscale values.
; Test case for https://github.com/llvm/llvm-project/issues/185860.
define void @shared_wide_load_not_narrowed(ptr noalias %src, ptr noalias %scales, ptr noalias %dst, i64 %n) {
; CHECK-LABEL: define void @shared_wide_load_not_narrowed(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[SCALES:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP2]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [16 x i8], ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [8 x i8], ptr [[SCALES]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP5]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = fmul <vscale x 2 x double> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [16 x i8], ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP6]], ptr [[TMP7]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP7]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
More information about the llvm-commits
mailing list