[llvm] [LAA] Only use inbounds/nusw in isNoWrap if the GEP is dereferenced. (PR #161445)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 27 21:21:07 PDT 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/161445
>From 156b47f324bfefa58af6537b11803b55929b69ac Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 30 Sep 2025 22:12:58 +0100
Subject: [PATCH 1/5] [LAA] Only use inbounds/nusw in isNoWrap if the GEP is
dereferenced.
Update isNoWrap to only use the inbounds/nusw flags from GEPs that are
guaranteed to be dereferenced on every iteration. This fixes a case
where we incorrectly determine no dependence.
I think the issue is isolated to code that evaluates the resulting
AddRec at BTC, just using it the compute the distance between accesses
should still be fine; if the access does not execute in a given
iteration, there's no dependence in that iteration. But isolating the
code is not straight-forward, so be conservative for now. The practical
impact should be very minor (only one loop changed across a corpus with
27k modules from large C/C++ workloads.
Fixes https://github.com/llvm/llvm-project/issues/160912.
---
.../llvm/Analysis/LoopAccessAnalysis.h | 3 +-
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 48 +++++++++++++------
llvm/lib/Analysis/VectorUtils.cpp | 8 ++--
.../inbounds-gep-in-predicated-blocks.ll | 5 ++
.../RISCV/masked_gather_scatter.ll | 40 +++++++++++-----
.../x86-interleaved-accesses-masked-group.ll | 4 +-
6 files changed, 75 insertions(+), 33 deletions(-)
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 84b4ad7c1d5a9..ff050a5c99224 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -896,7 +896,8 @@ getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
const Loop *Lp,
const DenseMap<Value *, const SCEV *> &StridesMap =
DenseMap<Value *, const SCEV *>(),
- bool Assume = false, bool ShouldCheckWrap = true);
+ bool Assume = false, bool ShouldCheckWrap = true,
+ DominatorTree *DT = nullptr);
/// Returns the distance between the pointers \p PtrA and \p PtrB iff they are
/// compatible and it is possible to calculate the distance between them. This
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index e27a9b1c44014..b13739def06dc 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -806,11 +806,11 @@ class AccessAnalysis {
typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
- MemoryDepChecker::DepCandidates &DA,
+ DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
PredicatedScalarEvolution &PSE,
SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
- : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
- LoopAliasScopes(LoopAliasScopes) {
+ : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
+ PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
// We're analyzing dependences across loop iterations.
BAA.enableCrossIterationMode();
}
@@ -934,6 +934,8 @@ class AccessAnalysis {
/// The LoopInfo of the loop being checked.
const LoopInfo *LI;
+ DominatorTree &DT;
+
/// Sets of potentially dependent accesses - members of one set share an
/// underlying pointer. The set "CheckDeps" identfies which sets really need a
/// dependence check.
@@ -1015,7 +1017,8 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
/// informating from the IR pointer value to determine no-wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
- std::optional<int64_t> Stride = std::nullopt) {
+ std::optional<int64_t> Stride = std::nullopt,
+ DominatorTree *DT = nullptr) {
// FIXME: This should probably only return true for NUW.
if (AR->getNoWrapFlags(SCEV::NoWrapMask))
return true;
@@ -1027,9 +1030,22 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
// the distance between the previously accessed location and the wrapped
// location will be larger than half the pointer index type space. In that
// case, the GEP would be poison and any memory access dependent on it would
- // be immediate UB when executed.
+ // be immediate UB when executed. The reasoning can only be applied if the
+ // pointer is dereferenced at least at the last iteration. For now, check if
+ // it is dereferenced in every iteration.
if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr);
- GEP && GEP->hasNoUnsignedSignedWrap())
+ GEP && GEP->hasNoUnsignedSignedWrap() &&
+ (L->getHeader() == L->getLoopLatch() ||
+ (any_of(GEP->users(), [L, DT](User *U) {
+ if (!isa<LoadInst, StoreInst>(U))
+ return false;
+ BasicBlock *UserBB = cast<Instruction>(U)->getParent();
+ if (DT && !LoopAccessInfo::blockNeedsPredication(UserBB, L, DT))
+ return true;
+ return UserBB == L->getHeader() ||
+ (L->getExitingBlock() == L->getLoopLatch() &&
+ UserBB == L->getLoopLatch());
+ }))))
return true;
if (!Stride)
@@ -1293,7 +1309,7 @@ bool AccessAnalysis::createCheckForAccess(
}
if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
- TheLoop, Assume))
+ TheLoop, Assume, std::nullopt, &DT))
return false;
}
@@ -1608,7 +1624,7 @@ std::optional<int64_t>
llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
const Loop *Lp,
const DenseMap<Value *, const SCEV *> &StridesMap,
- bool Assume, bool ShouldCheckWrap) {
+ bool Assume, bool ShouldCheckWrap, DominatorTree *DT) {
const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
if (PSE.getSE()->isLoopInvariant(PtrScev, Lp))
return 0;
@@ -1630,7 +1646,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
if (!ShouldCheckWrap || !Stride)
return Stride;
- if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride))
+ if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride, DT))
return Stride;
LLVM_DEBUG(
@@ -2047,10 +2063,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
BPtr->getType()->getPointerAddressSpace())
return MemoryDepChecker::Dependence::Unknown;
- std::optional<int64_t> StrideAPtr =
- getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true);
- std::optional<int64_t> StrideBPtr =
- getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true);
+ std::optional<int64_t> StrideAPtr = getPtrStride(
+ PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true, DT);
+ std::optional<int64_t> StrideBPtr = getPtrStride(
+ PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true, DT);
const SCEV *Src = PSE.getSCEV(APtr);
const SCEV *Sink = PSE.getSCEV(BPtr);
@@ -2627,7 +2643,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
}
MemoryDepChecker::DepCandidates DepCands;
- AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes);
+ AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE,
+ LoopAliasScopes);
// Holds the analyzed pointers. We don't want to call getUnderlyingObjects
// multiple times on the same object. If the ptr is accessed twice, once
@@ -2691,7 +2708,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
bool IsReadOnlyPtr = false;
Type *AccessTy = getLoadStoreType(LD);
if (Seen.insert({Ptr, AccessTy}).second ||
- !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) {
+ !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides, false,
+ true, DT)) {
++NumReads;
IsReadOnlyPtr = true;
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 091d94843698c..f5948ac624779 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1388,8 +1388,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
// even without the transformation. The wrapping checks are therefore
// deferred until after we've formed the interleaved groups.
int64_t Stride =
- getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides,
- /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0);
+ getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides,
+ /*Assume=*/true, /*ShouldCheckWrap=*/false, DT)
+ .value_or(0);
const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size,
@@ -1644,7 +1645,8 @@ void InterleavedAccessInfo::analyzeInterleaving(
Value *MemberPtr = getLoadStorePointerOperand(Member);
Type *AccessTy = getLoadStoreType(Member);
if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides,
- /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0))
+ /*Assume=*/false, /*ShouldCheckWrap=*/true, DT)
+ .value_or(0))
return false;
LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
<< FirstOrLast
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll
index 4c2a9c3f29f02..3c56dace4df1c 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll
@@ -19,9 +19,14 @@ define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) {
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group GRP0:
+; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A))
+; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header>
+; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
+; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw>
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index 89819f2be4967..1cbec47d72203 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -17,18 +17,33 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV32-LABEL: @foo4(
; RV32-NEXT: entry:
; RV32-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; RV32: vector.scevcheck:
+; RV32-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 128, i32 624)
+; RV32-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; RV32-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; RV32-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[MUL_RESULT]]
+; RV32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[TMP0]], [[A]]
+; RV32-NEXT: [[TMP2:%.*]] = or i1 [[TMP1]], [[MUL_OVERFLOW]]
+; RV32-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 256, i32 624)
+; RV32-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
+; RV32-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
+; RV32-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[MUL_RESULT2]]
+; RV32-NEXT: [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[B]]
+; RV32-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW3]]
+; RV32-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]]
+; RV32-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK1:%.*]]
; RV32: vector.memcheck:
-; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880
; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940
-; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752
-; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
-; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
+; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i32 79880
+; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i32 159752
+; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
+; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; RV32-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
; RV32-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
; RV32-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
-; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; RV32: vector.ph:
; RV32-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
; RV32-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
@@ -43,25 +58,26 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
+; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
; RV32-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100)
; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 1)
; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]]
-; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]]
+; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]]
; RV32-NEXT: [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double>
; RV32-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]]
; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
-; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
+; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3]], !noalias [[META5]]
; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; RV32-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; RV32: middle.block:
; RV32-NEXT: br label [[FOR_END:%.*]]
; RV32: scalar.ph:
+; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ]
; RV32-NEXT: br label [[FOR_BODY:%.*]]
; RV32: for.body:
-; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100
@@ -78,7 +94,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV32: for.inc:
; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
-; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
+; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]]
; RV32: for.end:
; RV32-NEXT: ret void
;
@@ -146,7 +162,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; RV64: for.inc:
; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
-; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
+; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]]
; RV64: for.end:
; RV64-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index efcc0005acaa3..8a0f50dec0598 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -711,8 +711,8 @@ define dso_local void @masked_strided3_optsize_unknown_tc(ptr noalias nocapture
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
-; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE16:%.*]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer
>From e9a52ac44d0ee9da363b74ee8327a4d1fdac1933 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 1 Oct 2025 22:38:55 +0100
Subject: [PATCH 2/5] !ifxup adjust comment, move check inside if
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 34 ++++----
.../x86-interleaved-accesses-masked-group.ll | 87 +++++++++++++++++--
2 files changed, 99 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index b13739def06dc..f9693cd2de0bb 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -934,6 +934,7 @@ class AccessAnalysis {
/// The LoopInfo of the loop being checked.
const LoopInfo *LI;
+ /// The dominator tree of the function.
DominatorTree &DT;
/// Sets of potentially dependent accesses - members of one set share an
@@ -1030,23 +1031,24 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
// the distance between the previously accessed location and the wrapped
// location will be larger than half the pointer index type space. In that
// case, the GEP would be poison and any memory access dependent on it would
- // be immediate UB when executed. The reasoning can only be applied if the
- // pointer is dereferenced at least at the last iteration. For now, check if
- // it is dereferenced in every iteration.
+ // be immediate UB when executed.
if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr);
- GEP && GEP->hasNoUnsignedSignedWrap() &&
- (L->getHeader() == L->getLoopLatch() ||
- (any_of(GEP->users(), [L, DT](User *U) {
- if (!isa<LoadInst, StoreInst>(U))
- return false;
- BasicBlock *UserBB = cast<Instruction>(U)->getParent();
- if (DT && !LoopAccessInfo::blockNeedsPredication(UserBB, L, DT))
- return true;
- return UserBB == L->getHeader() ||
- (L->getExitingBlock() == L->getLoopLatch() &&
- UserBB == L->getLoopLatch());
- }))))
- return true;
+ GEP && GEP->hasNoUnsignedSignedWrap()) {
+ // For the above reasoning to apply, the pointer must be dereferenced in
+ // every iteration.
+ if (L->getHeader() == L->getLoopLatch() ||
+ any_of(GEP->users(), [L, DT](User *U) {
+ if (!isa<LoadInst, StoreInst>(U))
+ return false;
+ BasicBlock *UserBB = cast<Instruction>(U)->getParent();
+ if (DT && !LoopAccessInfo::blockNeedsPredication(UserBB, L, DT))
+ return true;
+ return UserBB == L->getHeader() ||
+ (L->getExitingBlock() == L->getLoopLatch() &&
+ UserBB == L->getLoopLatch());
+ }))
+ return true;
+ }
if (!Stride)
Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE);
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index 8a0f50dec0598..f9570405ecabc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -716,12 +716,87 @@ define dso_local void @masked_strided3_optsize_unknown_tc(ptr noalias nocapture
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 [[INDEX]], 3
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]]
-; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>
-; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr align 1 [[TMP3]], <24 x i1> [[TMP5]], <24 x i8> poison)
-; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nsw <8 x i32> [[VEC_IND]], splat (i32 3)
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; ENABLED_MASKED_STRIDED: pred.load.if:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i32 [[TMP5]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP52]], align 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP53]], i64 0
+; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; ENABLED_MASKED_STRIDED: pred.load.continue:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP4]], i64 1
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; ENABLED_MASKED_STRIDED: pred.load.if3:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP11]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i64 1
+; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]]
+; ENABLED_MASKED_STRIDED: pred.load.continue4:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP4]], i64 2
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
+; ENABLED_MASKED_STRIDED: pred.load.if5:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP17]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i64 2
+; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]]
+; ENABLED_MASKED_STRIDED: pred.load.continue6:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP4]], i64 3
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
+; ENABLED_MASKED_STRIDED: pred.load.if7:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP23]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i64 3
+; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]]
+; ENABLED_MASKED_STRIDED: pred.load.continue8:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP4]], i64 4
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
+; ENABLED_MASKED_STRIDED: pred.load.if9:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP29]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i64 4
+; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]]
+; ENABLED_MASKED_STRIDED: pred.load.continue10:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP4]], i64 5
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
+; ENABLED_MASKED_STRIDED: pred.load.if11:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP35]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i64 5
+; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]]
+; ENABLED_MASKED_STRIDED: pred.load.continue12:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP4]], i64 6
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
+; ENABLED_MASKED_STRIDED: pred.load.if13:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP41]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i64 6
+; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]]
+; ENABLED_MASKED_STRIDED: pred.load.continue14:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP4]], i64 7
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]]
+; ENABLED_MASKED_STRIDED: pred.load.if15:
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP47]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i64 7
+; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]]
+; ENABLED_MASKED_STRIDED: pred.load.continue16:
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP6]], <8 x i1> [[TMP4]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
>From a7f0b09cc386189f4bc83c422eba6951c4e8d567 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 26 Oct 2025 19:36:02 +0000
Subject: [PATCH 3/5] !fixup address comments, thanks
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 6 +++---
.../inbounds-gep-in-predicated-blocks.ll | 12 +++++++++++-
2 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index f9693cd2de0bb..4cdafa3652744 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1037,8 +1037,8 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
// For the above reasoning to apply, the pointer must be dereferenced in
// every iteration.
if (L->getHeader() == L->getLoopLatch() ||
- any_of(GEP->users(), [L, DT](User *U) {
- if (!isa<LoadInst, StoreInst>(U))
+ any_of(GEP->users(), [L, DT,GEP](User *U) {
+ if (getLoadStorePointerOperand(U) != GEP)
return false;
BasicBlock *UserBB = cast<Instruction>(U)->getParent();
if (DT && !LoopAccessInfo::blockNeedsPredication(UserBB, L, DT))
@@ -1311,7 +1311,7 @@ bool AccessAnalysis::createCheckForAccess(
}
if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
- TheLoop, Assume, std::nullopt, &DT))
+ TheLoop, Assume, {}, &DT))
return false;
}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll
index 3c56dace4df1c..d90a97f1651e6 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll
@@ -10,7 +10,7 @@
; s0 += (1ULL << 62) + 1;
; s1 += (1ULL << 62) + 2;
; }
-; FIXME: We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine
+; We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine
; there are no dependences), as the pointers are not dereferenced in all loop iterations.
define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) {
; CHECK-LABEL: 'test_inbounds_gep_used_in_predicated_block'
@@ -68,9 +68,14 @@ define void @test_inbounds_gep_used_in_predicated_block_stored_value_operand(ptr
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group GRP0:
+; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A))
+; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header>
+; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were found in loop.
; CHECK-NEXT: SCEV assumptions:
+; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw>
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
;
@@ -114,9 +119,14 @@ define void @test_inbounds_gep_used_in_predicated_block_non_memop_user(ptr %A, i
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group GRP0:
+; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A))
+; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header>
+; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
+; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw>
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
;
>From 013ed3196d9eddb4fb6022ee3f5b6071b03cb578 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 26 Oct 2025 19:47:53 +0000
Subject: [PATCH 4/5] !fixup make DT non-optional
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 4cdafa3652744..18579e9bbdba4 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1018,8 +1018,8 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
/// informating from the IR pointer value to determine no-wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
- std::optional<int64_t> Stride = std::nullopt,
- DominatorTree *DT = nullptr) {
+ const DominatorTree &DT,
+ std::optional<int64_t> Stride = std::nullopt) {
// FIXME: This should probably only return true for NUW.
if (AR->getNoWrapFlags(SCEV::NoWrapMask))
return true;
@@ -1037,15 +1037,11 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
// For the above reasoning to apply, the pointer must be dereferenced in
// every iteration.
if (L->getHeader() == L->getLoopLatch() ||
- any_of(GEP->users(), [L, DT,GEP](User *U) {
+ any_of(GEP->users(), [L, &DT, GEP](User *U) {
if (getLoadStorePointerOperand(U) != GEP)
return false;
BasicBlock *UserBB = cast<Instruction>(U)->getParent();
- if (DT && !LoopAccessInfo::blockNeedsPredication(UserBB, L, DT))
- return true;
- return UserBB == L->getHeader() ||
- (L->getExitingBlock() == L->getLoopLatch() &&
- UserBB == L->getLoopLatch());
+ return !LoopAccessInfo::blockNeedsPredication(UserBB, L, &DT);
}))
return true;
}
@@ -1311,7 +1307,7 @@ bool AccessAnalysis::createCheckForAccess(
}
if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
- TheLoop, Assume, {}, &DT))
+ TheLoop, Assume, DT))
return false;
}
@@ -1648,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
if (!ShouldCheckWrap || !Stride)
return Stride;
- if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride, DT))
+ if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, *DT, Stride))
return Stride;
LLVM_DEBUG(
>From 52f85d60d29a00e620ada0d881dd2c4328c792c2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 28 Oct 2025 04:14:54 +0000
Subject: [PATCH 5/5] !fixup update getPtrstride to take DT as const reference
---
llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 5 ++---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 14 +++++++-------
llvm/lib/Analysis/VectorUtils.cpp | 8 ++++----
.../Target/AArch64/AArch64TargetTransformInfo.cpp | 10 ++++++----
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 9 ++++++---
llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp | 14 +++++++-------
.../Vectorize/LoopVectorizationLegality.cpp | 2 +-
7 files changed, 33 insertions(+), 29 deletions(-)
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index ff050a5c99224..c85ef3e131068 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -893,11 +893,10 @@ replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
/// result of this function is undefined.
LLVM_ABI std::optional<int64_t>
getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
- const Loop *Lp,
+ const Loop *Lp, const DominatorTree &DT,
const DenseMap<Value *, const SCEV *> &StridesMap =
DenseMap<Value *, const SCEV *>(),
- bool Assume = false, bool ShouldCheckWrap = true,
- DominatorTree *DT = nullptr);
+ bool Assume = false, bool ShouldCheckWrap = true);
/// Returns the distance between the pointers \p PtrA and \p PtrB iff they are
/// compatible and it is possible to calculate the distance between them. This
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 18579e9bbdba4..5d88e5f54e3d6 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1620,9 +1620,9 @@ void AccessAnalysis::processMemAccesses() {
/// Check whether the access through \p Ptr has a constant stride.
std::optional<int64_t>
llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
- const Loop *Lp,
+ const Loop *Lp, const DominatorTree &DT,
const DenseMap<Value *, const SCEV *> &StridesMap,
- bool Assume, bool ShouldCheckWrap, DominatorTree *DT) {
+ bool Assume, bool ShouldCheckWrap) {
const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
if (PSE.getSE()->isLoopInvariant(PtrScev, Lp))
return 0;
@@ -1644,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
if (!ShouldCheckWrap || !Stride)
return Stride;
- if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, *DT, Stride))
+ if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, DT, Stride))
return Stride;
LLVM_DEBUG(
@@ -2062,9 +2062,9 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
return MemoryDepChecker::Dependence::Unknown;
std::optional<int64_t> StrideAPtr = getPtrStride(
- PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true, DT);
+ PSE, ATy, APtr, InnermostLoop, *DT, SymbolicStrides, true, true);
std::optional<int64_t> StrideBPtr = getPtrStride(
- PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true, DT);
+ PSE, BTy, BPtr, InnermostLoop, *DT, SymbolicStrides, true, true);
const SCEV *Src = PSE.getSCEV(APtr);
const SCEV *Sink = PSE.getSCEV(BPtr);
@@ -2706,8 +2706,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
bool IsReadOnlyPtr = false;
Type *AccessTy = getLoadStoreType(LD);
if (Seen.insert({Ptr, AccessTy}).second ||
- !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides, false,
- true, DT)) {
+ !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false,
+ true)) {
++NumReads;
IsReadOnlyPtr = true;
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index f5948ac624779..150e43eaadb94 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1388,8 +1388,8 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
// even without the transformation. The wrapping checks are therefore
// deferred until after we've formed the interleaved groups.
int64_t Stride =
- getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides,
- /*Assume=*/true, /*ShouldCheckWrap=*/false, DT)
+ getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides,
+ /*Assume=*/true, /*ShouldCheckWrap=*/false)
.value_or(0);
const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
@@ -1644,8 +1644,8 @@ void InterleavedAccessInfo::analyzeInterleaving(
assert(Member && "Group member does not exist");
Value *MemberPtr = getLoadStorePointerOperand(Member);
Type *AccessTy = getLoadStoreType(Member);
- if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides,
- /*Assume=*/false, /*ShouldCheckWrap=*/true, DT)
+ if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides,
+ /*Assume=*/false, /*ShouldCheckWrap=*/true)
.value_or(0))
return false;
LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index fede586cf35bc..63ce378165200 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6122,7 +6122,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
}
static bool containsDecreasingPointers(Loop *TheLoop,
- PredicatedScalarEvolution *PSE) {
+ PredicatedScalarEvolution *PSE,
+ const DominatorTree &DT) {
const auto &Strides = DenseMap<Value *, const SCEV *>();
for (BasicBlock *BB : TheLoop->blocks()) {
// Scan the instructions in the block and look for addresses that are
@@ -6131,8 +6132,8 @@ static bool containsDecreasingPointers(Loop *TheLoop,
if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
Value *Ptr = getLoadStorePointerOperand(&I);
Type *AccessTy = getLoadStoreType(&I);
- if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
- /*ShouldCheckWrap=*/false)
+ if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
+ /*Assume=*/true, /*ShouldCheckWrap=*/false)
.value_or(0) < 0)
return true;
}
@@ -6177,7 +6178,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
// negative strides. This will require extra work to reverse the loop
// predicate, which may be expensive.
if (containsDecreasingPointers(TFI->LVL->getLoop(),
- TFI->LVL->getPredicatedScalarEvolution()))
+ TFI->LVL->getPredicatedScalarEvolution(),
+ *TFI->LVL->getDominatorTree()))
Required |= TailFoldingOpts::Reverse;
if (Required == TailFoldingOpts::Disabled)
Required |= TailFoldingOpts::Simple;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 9b250e6cac3ab..608088625cb24 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2448,7 +2448,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
//
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
const DataLayout &DL,
- const LoopAccessInfo *LAI) {
+ const LoopAccessInfo *LAI,
+ const DominatorTree &DT) {
LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
// If there are live-out values, it is probably a reduction. We can predicate
@@ -2498,7 +2499,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
Value *Ptr = getLoadStorePointerOperand(&I);
Type *AccessTy = getLoadStoreType(&I);
- int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
+ int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L,
+ DT).value_or(0);
if (NextStride == 1) {
// TODO: for now only allow consecutive strides of 1. We could support
// other strides as long as it is uniform, but let's keep it simple
@@ -2585,7 +2587,8 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
return false;
}
- return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
+ return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(),
+ *LVL->getDominatorTree());
}
TailFoldingStyle
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index a8839981e5478..38fde5333ca92 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate {
/// Return true if the dependence from the store to the load has an
/// absolute distance of one.
/// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop)
- bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
- Loop *L) const {
+ bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L,
+ const DominatorTree &DT) const {
Value *LoadPtr = Load->getPointerOperand();
Value *StorePtr = Store->getPointerOperand();
Type *LoadType = getLoadStoreType(Load);
@@ -102,8 +102,8 @@ struct StoreToLoadForwardingCandidate {
DL.getTypeSizeInBits(getLoadStoreType(Store)) &&
"Should be a known dependence");
- int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0);
- int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0);
+ int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0);
+ int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0);
if (!StrideLoad || !StrideStore || StrideLoad != StrideStore)
return false;
@@ -287,8 +287,8 @@ class LoadEliminationForLoop {
// so deciding which one forwards is easy. The later one forwards as
// long as they both have a dependence distance of one to the load.
if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
- Cand.isDependenceDistanceOfOne(PSE, L) &&
- OtherCand->isDependenceDistanceOfOne(PSE, L)) {
+ Cand.isDependenceDistanceOfOne(PSE, L, *DT) &&
+ OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) {
// They are in the same block, the later one will forward to the load.
if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
OtherCand = &Cand;
@@ -538,7 +538,7 @@ class LoadEliminationForLoop {
// Check whether the SCEV difference is the same as the induction step,
// thus we load the value in the next iteration.
- if (!Cand.isDependenceDistanceOfOne(PSE, L))
+ if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT))
continue;
assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) &&
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index fdfff16132093..bec90ff91c2bf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -462,7 +462,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
bool CanAddPredicate = !llvm::shouldOptimizeForSize(
TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
- int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
+ int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides,
CanAddPredicate, false).value_or(0);
if (Stride == 1 || Stride == -1)
return Stride;
More information about the llvm-commits
mailing list