[llvm] [WIP][LoopVectorize] Perform loop versioning for some early exit loops (PR #120603)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 19 08:34:01 PST 2024


https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/120603

None

>From e0e8ca8b1bd4d09d611410b8067d41ba00e9054e Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 19 Dec 2024 16:31:51 +0000
Subject: [PATCH] [WIP][LoopVectorize] Perform loop versioning for some early
 exit loops

---
 llvm/include/llvm/Analysis/Loads.h            |  3 +-
 .../Vectorize/LoopVectorizationLegality.h     | 19 +++++
 llvm/lib/Analysis/Loads.cpp                   | 21 +++--
 .../Vectorize/LoopVectorizationLegality.cpp   | 52 +++++++++++--
 .../Transforms/Vectorize/LoopVectorize.cpp    | 53 +++++++++++--
 .../LoopVectorize/early_exit_legality.ll      |  6 +-
 .../single_early_exit_unsafe_ptrs.ll          | 77 ++++++++++++++++++-
 7 files changed, 207 insertions(+), 24 deletions(-)

diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b0..828f64e0e59432 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -92,7 +92,8 @@ bool isDereferenceableAndAlignedInLoop(
 /// contains read-only memory accesses.
 bool isDereferenceableReadOnlyLoop(
     Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
-    SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
+    SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr,
+    SmallVectorImpl<LoadInst *> *NonDerefLoads = nullptr);
 
 /// Return true if we know that executing a load from this value cannot trap.
 ///
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index fbe80eddbae07a..7f6f8f9c3f5bfc 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -425,6 +425,19 @@ class LoopVectorizationLegality {
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
 
+  /// Return the number of loads in the loop we have to consider that could
+  /// potentially fault in a loop with uncountable early exits.
+  unsigned getNumPotentiallyFaultingLoads() const {
+    return PotentiallyFaultingLoads.size();
+  }
+
+  /// Return a vector of all potentially faulting loads in a loop with
+  /// uncountable early exits.
+  const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *
+  getPotentiallyFaultingLoads() const {
+    return &PotentiallyFaultingLoads;
+  }
+
   /// Returns a HistogramInfo* for the given instruction if it was determined
   /// to be part of a load -> update -> store sequence where multiple lanes
   /// may be working on the same memory address.
@@ -533,6 +546,8 @@ class LoopVectorizationLegality {
   /// additional cases safely.
   bool isVectorizableEarlyExitLoop();
 
+  bool analyzePotentiallyFaultingLoads(SmallVectorImpl<LoadInst *> *Loads);
+
   /// Return true if all of the instructions in the block can be speculatively
   /// executed, and record the loads/stores that require masking.
   /// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -656,6 +671,10 @@ class LoopVectorizationLegality {
   /// Keep track of the destinations of all uncountable exits if the
   /// exact backedge taken count is not computable.
   SmallVector<BasicBlock *, 4> UncountableExitBlocks;
+
+  /// Keep a record of all potentially faulting loads in loops with
+  /// uncountable early exits.
+  SmallVector<std::pair<LoadInst *, const SCEV *>, 4> PotentiallyFaultingLoads;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 54b9521fda8fd2..7818dcf84278fa 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -808,15 +808,26 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To,
 
 bool llvm::isDereferenceableReadOnlyLoop(
     Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
-    SmallVectorImpl<const SCEVPredicate *> *Predicates) {
+    SmallVectorImpl<const SCEVPredicate *> *Predicates,
+    SmallVectorImpl<LoadInst *> *NonDerefLoads) {
+  bool Result = true;
   for (BasicBlock *BB : L->blocks()) {
     for (Instruction &I : *BB) {
       if (auto *LI = dyn_cast<LoadInst>(&I)) {
-        if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates))
+        if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC,
+                                               Predicates)) {
+          if (!NonDerefLoads)
+            return false;
+          NonDerefLoads->push_back(LI);
+          Result = false;
+        }
+      } else if (I.mayReadFromMemory() || I.mayWriteToMemory() ||
+                 I.mayThrow()) {
+        if (!NonDerefLoads)
           return false;
-      } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
-        return false;
+        Result = false;
+      }
     }
   }
-  return true;
+  return Result;
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 555c8435dd330d..e7cd6c433476a1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1589,6 +1589,39 @@ bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
   return Result;
 }
 
+bool LoopVectorizationLegality::analyzePotentiallyFaultingLoads(
+    SmallVectorImpl<LoadInst *> *Loads) {
+  LLVM_DEBUG(dbgs() << "Found potentially faulting loads in loop with "
+                       "uncountable early exit:\n");
+  for (LoadInst *LI : *Loads) {
+    LLVM_DEBUG(dbgs() << "Load: " << *LI << '\n');
+    Value *Ptr = LI->getPointerOperand();
+    if (!Ptr)
+      return false;
+    const SCEV *PtrExpr = PSE.getSCEV(Ptr);
+    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
+    // TODO: Deal with loop invariant pointers.
+    if (!AR || AR->getLoop() != TheLoop || !AR->isAffine())
+      return false;
+    auto Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*PSE.getSE()));
+    if (!Step)
+      return false;
+    const SCEV *Start = AR->getStart();
+
+    // Make sure the step is positive and matches the object size in memory.
+    // TODO: Extend this to cover more cases.
+    auto &DL = LI->getDataLayout();
+    APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
+                  DL.getTypeStoreSize(LI->getType()).getFixedValue());
+    if (EltSize != Step->getAPInt())
+      return false;
+
+    LLVM_DEBUG(dbgs() << "SCEV for Load Ptr: " << *Start << 'n');
+    PotentiallyFaultingLoads.push_back({LI, Start});
+  }
+  return true;
+}
+
 bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
   BasicBlock *LatchBB = TheLoop->getLoopLatch();
   if (!LatchBB) {
@@ -1713,15 +1746,18 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
   assert(LatchBB->getUniquePredecessor() == getUncountableEarlyExitingBlock() &&
          "Expected latch predecessor to be the early exiting block");
 
-  // TODO: Handle loops that may fault.
   Predicates.clear();
-  if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
-                                     &Predicates)) {
-    reportVectorizationFailure(
-        "Loop may fault",
-        "Cannot vectorize potentially faulting early exit loop",
-        "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
-    return false;
+  SmallVector<LoadInst *, 4> Loads;
+  if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, &Predicates,
+                                     &Loads)) {
+    if (!analyzePotentiallyFaultingLoads(&Loads)) {
+      reportVectorizationFailure(
+          "Loop may fault",
+          "Cannot vectorize potentially faulting early exit loop",
+          "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+      return false;
+    }
+    LLVM_DEBUG(dbgs() << "We can vectorize the loop with runtime checks.\n");
   }
 
   [[maybe_unused]] const SCEV *SymbolicMaxBTC =
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a8511483e00fbe..a593259506cdd3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2135,6 +2135,29 @@ class GeneratedRTChecks {
 };
 } // namespace
 
+static void addPointerAlignmentChecks(
+    const SmallVectorImpl<std::pair<LoadInst *, const SCEV *>> *Loads,
+    PredicatedScalarEvolution &PSE, ElementCount VF) {
+  ScalarEvolution *SE = PSE.getSE();
+  const DataLayout &DL = SE->getDataLayout();
+  Type *PtrIntType = DL.getIntPtrType(SE->getContext());
+
+  const SCEV *Zero = SE->getZero(PtrIntType);
+  const SCEV *ScevEC = SE->getElementCount(PtrIntType, VF);
+
+  for (auto Load : *Loads) {
+    APInt EltSize(
+        DL.getIndexTypeSizeInBits(Load.first->getPointerOperandType()),
+        DL.getTypeStoreSize(Load.first->getType()).getFixedValue());
+    const SCEV *Start = SE->getPtrToIntExpr(Load.second, PtrIntType);
+    const SCEV *Align =
+        SE->getMulExpr(ScevEC, SE->getConstant(EltSize),
+                       (SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNUW));
+    const SCEV *Rem = SE->getURemExpr(Start, Align);
+    PSE.addPredicate(*(SE->getEqualPredicate(Rem, Zero)));
+  }
+}
+
 static bool useActiveLaneMask(TailFoldingStyle Style) {
   return Style == TailFoldingStyle::Data ||
          Style == TailFoldingStyle::DataAndControlFlow ||
@@ -10236,11 +10259,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
-  if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
-    reportVectorizationFailure("Auto-vectorization of loops with uncountable "
-                               "early exit is not enabled",
-                               "UncountableEarlyExitLoopsDisabled", ORE, L);
-    return false;
+  if (LVL.hasUncountableEarlyExit()) {
+    if (!EnableEarlyExitVectorization) {
+      reportVectorizationFailure("Auto-vectorization of loops with uncountable "
+                                 "early exit is not enabled",
+                                 "UncountableEarlyExitLoopsDisabled", ORE, L);
+      return false;
+    }
+
+    unsigned NumPotentiallyFaultingPointers =
+        LVL.getNumPotentiallyFaultingLoads();
+    if (NumPotentiallyFaultingPointers > 1) {
+      reportVectorizationFailure("Not worth vectorizing loop with uncountable "
+                                 "early exit, due to number of potentially "
+                                 "faulting loads",
+                                 "UncountableEarlyExitMayFault", ORE, L);
+      return false;
+    } else if (NumPotentiallyFaultingPointers)
+      LLVM_DEBUG(dbgs() << "LV: Need to version early-exit vector loop with"
+                        << "pointer alignment checks.\n");
   }
 
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
@@ -10391,8 +10428,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     unsigned SelectedIC = std::max(IC, UserIC);
     //  Optimistically generate runtime checks if they are needed. Drop them if
     //  they turn out to not be profitable.
-    if (VF.Width.isVector() || SelectedIC > 1)
+    if (VF.Width.isVector() || SelectedIC > 1) {
+      if (LVL.getNumPotentiallyFaultingLoads())
+        addPointerAlignmentChecks(LVL.getPotentiallyFaultingLoads(), PSE,
+                                  VF.Width);
       Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+    }
 
     // Check if it is profitable to vectorize with runtime checks.
     bool ForceVectorization =
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index 8df0eaec6a8c9d..d106b0c0921de4 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -208,7 +208,7 @@ loop.end:
 
 define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
 ; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_allocas'
-; CHECK:       LV: Not vectorizing: Loop may fault.
+; CHECK:       LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
 entry:
   %p1 = alloca [42 x i8]
   %p2 = alloca [42 x i8]
@@ -238,7 +238,7 @@ loop.end:
 
 define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) {
 ; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_deref_ptrs'
-; CHECK:       LV: Not vectorizing: Loop may fault.
+; CHECK:       LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
 entry:
   br label %loop
 
@@ -264,7 +264,7 @@ loop.end:
 
 define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_unknown_ptrs'
-; CHECK:       LV: Not vectorizing: Loop may fault.
+; CHECK:       LV: Not vectorizing: Not worth vectorizing loop with uncountable early exit, due to number of potentially faulting loads.
 entry:
   br label %loop
 
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll
index c68eeac19c9ecf..12c46f7f60123d 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s
 
 declare void @init_mem(ptr, i64);
 
@@ -141,3 +141,78 @@ loop.end:
   %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
   ret i64 %retval
 }
+
+define i64 @same_exit_block_pre_inc_use1_unknown_single_ptr(ptr %p1) {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_single_ptr(
+; CHECK-SAME: ptr [[P1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P11:%.*]] = ptrtoint ptr [[P1]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[P11]] to i2
+; CHECK-NEXT:    [[TMP1:%.*]] = add i2 [[TMP0]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i2 [[TMP1]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], splat (i8 3)
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.split:
+; CHECK-NEXT:    br i1 [[TMP8]], label [[LOOP_END:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[VECTOR_SCEVCHECK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP1:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], 3
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ 1, [[LOOP1]] ], [ 0, [[LOOP_INC]] ], [ 0, [[MIDDLE_BLOCK]] ], [ 1, [[MIDDLE_SPLIT]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %cmp3 = icmp eq i8 %ld1, 3
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ 1, %loop ], [ 0, %loop.inc ]
+  ret i64 %retval
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.



More information about the llvm-commits mailing list