[llvm] [WIP][LoopVectorize] Perform loop versioning for some early exit loops (PR #120603)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 19 08:34:01 PST 2024
https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/120603
None
>From e0e8ca8b1bd4d09d611410b8067d41ba00e9054e Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 19 Dec 2024 16:31:51 +0000
Subject: [PATCH] [WIP][LoopVectorize] Perform loop versioning for some early
exit loops
---
llvm/include/llvm/Analysis/Loads.h | 3 +-
.../Vectorize/LoopVectorizationLegality.h | 19 +++++
llvm/lib/Analysis/Loads.cpp | 21 +++--
.../Vectorize/LoopVectorizationLegality.cpp | 52 +++++++++++--
.../Transforms/Vectorize/LoopVectorize.cpp | 53 +++++++++++--
.../LoopVectorize/early_exit_legality.ll | 6 +-
.../single_early_exit_unsafe_ptrs.ll | 77 ++++++++++++++++++-
7 files changed, 207 insertions(+), 24 deletions(-)
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b0..828f64e0e59432 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -92,7 +92,8 @@ bool isDereferenceableAndAlignedInLoop(
/// contains read-only memory accesses.
bool isDereferenceableReadOnlyLoop(
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
- SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
+ SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr,
+ SmallVectorImpl<LoadInst *> *NonDerefLoads = nullptr);
/// Return true if we know that executing a load from this value cannot trap.
///
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index fbe80eddbae07a..7f6f8f9c3f5bfc 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -425,6 +425,19 @@ class LoopVectorizationLegality {
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }
+ /// Return the number of loads in the loop we have to consider that could
+ /// potentially fault in a loop with uncountable early exits.
+ unsigned getNumPotentiallyFaultingLoads() const {
+ return PotentiallyFaultingLoads.size();
+ }
+
+ /// Return a vector of all potentially faulting loads in a loop with
+ /// uncountable early exits.
+ const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *
+ getPotentiallyFaultingLoads() const {
+ return &PotentiallyFaultingLoads;
+ }
+
/// Returns a HistogramInfo* for the given instruction if it was determined
/// to be part of a load -> update -> store sequence where multiple lanes
/// may be working on the same memory address.
@@ -533,6 +546,8 @@ class LoopVectorizationLegality {
/// additional cases safely.
bool isVectorizableEarlyExitLoop();
+ bool analyzePotentiallyFaultingLoads(SmallVectorImpl<LoadInst *> *Loads);
+
/// Return true if all of the instructions in the block can be speculatively
/// executed, and record the loads/stores that require masking.
/// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -656,6 +671,10 @@ class LoopVectorizationLegality {
/// Keep track of the destinations of all uncountable exits if the
/// exact backedge taken count is not computable.
SmallVector<BasicBlock *, 4> UncountableExitBlocks;
+
+ /// Keep a record of all potentially faulting loads in loops with
+ /// uncountable early exits.
+ SmallVector<std::pair<LoadInst *, const SCEV *>, 4> PotentiallyFaultingLoads;
};
} // namespace llvm
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 54b9521fda8fd2..7818dcf84278fa 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -808,15 +808,26 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To,
bool llvm::isDereferenceableReadOnlyLoop(
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
- SmallVectorImpl<const SCEVPredicate *> *Predicates) {
+ SmallVectorImpl<const SCEVPredicate *> *Predicates,
+ SmallVectorImpl<LoadInst *> *NonDerefLoads) {
+ bool Result = true;
for (BasicBlock *BB : L->blocks()) {
for (Instruction &I : *BB) {
if (auto *LI = dyn_cast<LoadInst>(&I)) {
- if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates))
+ if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC,
+ Predicates)) {
+ if (!NonDerefLoads)
+ return false;
+ NonDerefLoads->push_back(LI);
+ Result = false;
+ }
+ } else if (I.mayReadFromMemory() || I.mayWriteToMemory() ||
+ I.mayThrow()) {
+ if (!NonDerefLoads)
return false;
- } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
- return false;
+ Result = false;
+ }
}
}
- return true;
+ return Result;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 555c8435dd330d..e7cd6c433476a1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1589,6 +1589,39 @@ bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
return Result;
}
+bool LoopVectorizationLegality::analyzePotentiallyFaultingLoads(
+ SmallVectorImpl<LoadInst *> *Loads) {
+ LLVM_DEBUG(dbgs() << "Found potentially faulting loads in loop with "
+ "uncountable early exit:\n");
+ for (LoadInst *LI : *Loads) {
+ LLVM_DEBUG(dbgs() << "Load: " << *LI << '\n');
+ Value *Ptr = LI->getPointerOperand();
+ if (!Ptr)
+ return false;
+ const SCEV *PtrExpr = PSE.getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
+ // TODO: Deal with loop invariant pointers.
+ if (!AR || AR->getLoop() != TheLoop || !AR->isAffine())
+ return false;
+ auto Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*PSE.getSE()));
+ if (!Step)
+ return false;
+ const SCEV *Start = AR->getStart();
+
+ // Make sure the step is positive and matches the object size in memory.
+ // TODO: Extend this to cover more cases.
+ auto &DL = LI->getDataLayout();
+ APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
+ DL.getTypeStoreSize(LI->getType()).getFixedValue());
+ if (EltSize != Step->getAPInt())
+ return false;
+
+    LLVM_DEBUG(dbgs() << "SCEV for Load Ptr: " << *Start << '\n');
+ PotentiallyFaultingLoads.push_back({LI, Start});
+ }
+ return true;
+}
+
bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
BasicBlock *LatchBB = TheLoop->getLoopLatch();
if (!LatchBB) {
@@ -1713,15 +1746,18 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
assert(LatchBB->getUniquePredecessor() == getUncountableEarlyExitingBlock() &&
"Expected latch predecessor to be the early exiting block");
- // TODO: Handle loops that may fault.
Predicates.clear();
- if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
- &Predicates)) {
- reportVectorizationFailure(
- "Loop may fault",
- "Cannot vectorize potentially faulting early exit loop",
- "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
- return false;
+ SmallVector<LoadInst *, 4> Loads;
+ if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, &Predicates,
+ &Loads)) {
+ if (!analyzePotentiallyFaultingLoads(&Loads)) {
+ reportVectorizationFailure(
+ "Loop may fault",
+ "Cannot vectorize potentially faulting early exit loop",
+ "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "We can vectorize the loop with runtime checks.\n");
}
[[maybe_unused]] const SCEV *SymbolicMaxBTC =
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a8511483e00fbe..a593259506cdd3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2135,6 +2135,29 @@ class GeneratedRTChecks {
};
} // namespace
+static void addPointerAlignmentChecks(
+ const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *Loads,
+ PredicatedScalarEvolution &PSE, ElementCount VF) {
+ ScalarEvolution *SE = PSE.getSE();
+ const DataLayout &DL = SE->getDataLayout();
+ Type *PtrIntType = DL.getIntPtrType(SE->getContext());
+
+ const SCEV *Zero = SE->getZero(PtrIntType);
+ const SCEV *ScevEC = SE->getElementCount(PtrIntType, VF);
+
+ for (auto Load : *Loads) {
+ APInt EltSize(
+ DL.getIndexTypeSizeInBits(Load.first->getPointerOperandType()),
+ DL.getTypeStoreSize(Load.first->getType()).getFixedValue());
+ const SCEV *Start = SE->getPtrToIntExpr(Load.second, PtrIntType);
+ const SCEV *Align =
+ SE->getMulExpr(ScevEC, SE->getConstant(EltSize),
+ (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW));
+ const SCEV *Rem = SE->getURemExpr(Start, Align);
+ PSE.addPredicate(*(SE->getEqualPredicate(Rem, Zero)));
+ }
+}
+
static bool useActiveLaneMask(TailFoldingStyle Style) {
return Style == TailFoldingStyle::Data ||
Style == TailFoldingStyle::DataAndControlFlow ||
@@ -10236,11 +10259,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
- reportVectorizationFailure("Auto-vectorization of loops with uncountable "
- "early exit is not enabled",
- "UncountableEarlyExitLoopsDisabled", ORE, L);
- return false;
+ if (LVL.hasUncountableEarlyExit()) {
+ if (!EnableEarlyExitVectorization) {
+ reportVectorizationFailure("Auto-vectorization of loops with uncountable "
+ "early exit is not enabled",
+ "UncountableEarlyExitLoopsDisabled", ORE, L);
+ return false;
+ }
+
+ unsigned NumPotentiallyFaultingPointers =
+ LVL.getNumPotentiallyFaultingLoads();
+ if (NumPotentiallyFaultingPointers > 1) {
+ reportVectorizationFailure("Not worth vectorizing loop with uncountable "
+ "early exit, due to number of potentially "
+ "faulting loads",
+ "UncountableEarlyExitMayFault", ORE, L);
+ return false;
+ } else if (NumPotentiallyFaultingPointers)
+      LLVM_DEBUG(dbgs() << "LV: Need to version early-exit vector loop with "
+                        << "pointer alignment checks.\n");
}
// Entrance to the VPlan-native vectorization path. Outer loops are processed
@@ -10391,8 +10428,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned SelectedIC = std::max(IC, UserIC);
// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
- if (VF.Width.isVector() || SelectedIC > 1)
+ if (VF.Width.isVector() || SelectedIC > 1) {
+ if (LVL.getNumPotentiallyFaultingLoads())
+ addPointerAlignmentChecks(LVL.getPotentiallyFaultingLoads(), PSE,
+ VF.Width);
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+ }
// Check if it is profitable to vectorize with runtime checks.
bool ForceVectorization =
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index 8df0eaec6a8c9d..d106b0c0921de4 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -208,7 +208,7 @@ loop.end:
define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_allocas'
-; CHECK: LV: Not vectorizing: Loop may fault.
+; CHECK: LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
entry:
%p1 = alloca [42 x i8]
%p2 = alloca [42 x i8]
@@ -238,7 +238,7 @@ loop.end:
define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) {
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_deref_ptrs'
-; CHECK: LV: Not vectorizing: Loop may fault.
+; CHECK: LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
entry:
br label %loop
@@ -264,7 +264,7 @@ loop.end:
define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) {
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_unknown_ptrs'
-; CHECK: LV: Not vectorizing: Loop may fault.
+; CHECK: LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll
index c68eeac19c9ecf..12c46f7f60123d 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s
declare void @init_mem(ptr, i64);
@@ -141,3 +141,78 @@ loop.end:
%retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
ret i64 %retval
}
+
+define i64 @same_exit_block_pre_inc_use1_unknown_single_ptr(ptr %p1) {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_single_ptr(
+; CHECK-SAME: ptr [[P1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P11:%.*]] = ptrtoint ptr [[P1]] to i64
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[P11]] to i2
+; CHECK-NEXT: [[TMP1:%.*]] = add i2 [[TMP0]], -1
+; CHECK-NEXT: [[TMP2:%.*]] = zext i2 [[TMP1]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX2]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], splat (i8 3)
+; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
+; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.split:
+; CHECK-NEXT: br i1 [[TMP8]], label [[LOOP_END:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[VECTOR_SCEVCHECK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP1:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 3
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK: loop.inc:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: loop.end:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 1, [[LOOP1]] ], [ 0, [[LOOP_INC]] ], [ 0, [[MIDDLE_BLOCK]] ], [ 1, [[MIDDLE_SPLIT]] ]
+; CHECK-NEXT: ret i64 [[RETVAL]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+ %ld1 = load i8, ptr %arrayidx, align 1
+ %cmp3 = icmp eq i8 %ld1, 3
+ br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+ %index.next = add i64 %index, 1
+ %exitcond = icmp ne i64 %index.next, 67
+ br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+ %retval = phi i64 [ 1, %loop ], [ 0, %loop.inc ]
+ ret i64 %retval
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.
More information about the llvm-commits
mailing list