[llvm] 3e898bc - [LV] Fix cost misaligned when gather/scatter w/ addr is uniform. (#157387)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 16:49:58 PDT 2025
Author: Elvis Wang
Date: 2025-09-11T07:49:54+08:00
New Revision: 3e898bc40fc344b72cdf6b0ee75eb22b73ee840f
URL: https://github.com/llvm/llvm-project/commit/3e898bc40fc344b72cdf6b0ee75eb22b73ee840f
DIFF: https://github.com/llvm/llvm-project/commit/3e898bc40fc344b72cdf6b0ee75eb22b73ee840f.diff
LOG: [LV] Fix cost misaligned when gather/scatter w/ addr is uniform. (#157387)
This patch fixes the assertion triggered when `isUniform` (from the legacy
cost model) and `isSingleScalar` (from the VPlan-based model) disagree.
A simplified test that triggers the assertion:
```
loop:
  %loadA = load ptr, ptr %a      ; %a is loop-invariant
  %loadB = load i32, ptr %loadA  ; address %loadA is uniform
...
```
The legacy cost model cannot determine that the address of `%loadB` is
uniform, but the VPlan-based cost model concludes that the addresses of both
`%loadA` and `%loadB` are single scalars.
Full test that caused the crash: https://llvm.godbolt.org/z/zEG8YKjqh.
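For reference, a self-contained version of that pattern might look like the
following sketch (function and value names here are illustrative, not taken
from the committed test):
```
define void @sketch(ptr %a, ptr %out, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; %a is loop-invariant, so this load produces the same pointer on
  ; every iteration.
  %loadA = load ptr, ptr %a, align 8
  ; The address (%loadA) is uniform; the VPlan-based model proves this
  ; recursively, while the legacy model does not.
  %loadB = load i32, ptr %loadA, align 4
  %dst = getelementptr i32, ptr %out, i64 %iv
  store i32 %loadB, ptr %dst, align 4
  %iv.next = add i64 %iv, 1
  %done = icmp eq i64 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
```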
---------
Co-authored-by: Luke Lau <luke at igalia.com>
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3cff43a510298..b4acda80cfb93 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6907,6 +6907,16 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
       if (isa<VPPartialReductionRecipe>(&R))
         return true;
 
+      // The VPlan-based cost model can analyze if recipes are scalar
+      // recursively, but the legacy cost model cannot.
+      if (auto *WidenMemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
+        auto *AddrI = dyn_cast<Instruction>(
+            getLoadStorePointerOperand(&WidenMemR->getIngredient()));
+        if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
+                         CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
+          return true;
+      }
+
       /// If a VPlan transform folded a recipe to one producing a single-scalar,
       /// but the original instruction wasn't uniform-after-vectorization in the
       /// legacy cost model, the legacy cost overestimates the actual cost.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
index 4f91670e7751a..43165aa704626 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
@@ -184,3 +184,75 @@ loop:
 exit:
   ret void
 }
+
+define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2, ptr %p3, i64 %N) {
+; CHECK-LABEL: @store_to_addr_generated_from_invariant_addr(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P0:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP2:%.*]] = mul <vscale x 2 x i64> [[TMP1]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP2]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P1:%.*]], <vscale x 2 x i64> [[VEC_IND]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[TMP5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[P2:%.*]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P3:%.*]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x ptr> align 1 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P1]], i64 [[IV]]
+; CHECK-NEXT: store ptr [[P0]], ptr [[ARRAYIDX11]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[P2]], align 4
+; CHECK-NEXT: [[BITS_TO_GO:%.*]] = getelementptr i8, ptr [[P3]], i64 [[TMP10]]
+; CHECK-NEXT: store i32 0, ptr [[BITS_TO_GO]], align 4
+; CHECK-NEXT: store i32 0, ptr [[BITS_TO_GO]], align 4
+; CHECK-NEXT: store i8 0, ptr [[BITS_TO_GO]], align 1
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx11 = getelementptr i32, ptr %p1, i64 %iv
+  store ptr %p0, ptr %arrayidx11, align 8
+  %0 = load i64, ptr %p2, align 4
+  %bits_to_go = getelementptr i8, ptr %p3, i64 %0
+  store i32 0, ptr %bits_to_go, align 4
+  store i32 0, ptr %bits_to_go, align 4
+  store i8 0, ptr %bits_to_go, align 1
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
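As a usage note: assuming a standard lit-style RUN line at the top of
gather-scatter-cost.ll (not shown in this hunk), the new test can be
exercised with something like:
```
; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
```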