[llvm] [LoopVectorize] Teach LoopVectorizationLegality about more early exits (PR #107004)

Fri Sep 13 08:50:06 PDT 2024

================
@@ -1442,6 +1487,126 @@ bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
   return Result;
 }
 
+bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
+  // At least one of the exiting blocks must be the latch.
+  BasicBlock *LatchBB = TheLoop->getLoopLatch();
+  if (!LatchBB) {
+    reportVectorizationFailure("Loop does not have a latch",
+                               "Cannot vectorize early exit loop",
+                               "NoLatchEarlyExit", ORE, TheLoop);
+    return false;
+  }
+
+  if (Reductions.size() || FixedOrderRecurrences.size()) {
+    reportVectorizationFailure(
+        "Found reductions or recurrences in early-exit loop",
+        "Cannot vectorize early exit loop with reductions or recurrences",
+        "RecurrencesInEarlyExitLoop", ORE, TheLoop);
+    return false;
+  }
+
+  SmallVector<BasicBlock *, 8> ExitingBlocks;
+  TheLoop->getExitingBlocks(ExitingBlocks);
+
+  // Keep a record of all the exiting blocks.
+  SmallVector<const SCEVPredicate *, 4> Predicates;
+  for (BasicBlock *BB1 : ExitingBlocks) {
+    const SCEV *EC =
+        PSE.getSE()->getPredicatedExitCount(TheLoop, BB1, &Predicates);
+    if (isa<SCEVCouldNotCompute>(EC)) {
+      UncountableExitingBlocks.push_back(BB1);
+
+      SmallVector<BasicBlock *, 2> Succs(successors(BB1));
+      if (Succs.size() != 2) {
+        reportVectorizationFailure(
+            "Early exiting block does not have exactly two successors",
+            "Incorrect number of successors from early exiting block",
+            "EarlyExitTooManySuccessors", ORE, TheLoop);
+        return false;
+      }
+
+      BasicBlock *BB2;
+      if (!TheLoop->contains(Succs[0]))
+        BB2 = Succs[0];
+      else {
+        assert(!TheLoop->contains(Succs[1]));
+        BB2 = Succs[1];
+      }
+      UncountableExitBlocks.push_back(BB2);
+    } else
+      CountableExitingBlocks.push_back(BB1);
+  }
+  Predicates.clear();
+
+  // We only support one uncountable early exit.
+  if (getUncountableExitingBlocks().size() != 1) {
+    reportVectorizationFailure(
+        "Loop has too many uncountable exits",
+        "Cannot vectorize early exit loop with more than one early exit",
+        "TooManyUncountableEarlyExits", ORE, TheLoop);
+    return false;
+  }
+
+  // The only supported early exit loops so far are ones where the early
+  // exiting block is a unique predecessor of the latch block.
+  BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
+  if (!LatchPredBB || LatchPredBB != getUncountableExitingBlocks()[0]) {
+    reportVectorizationFailure("Early exit is not the latch predecessor",
+                               "Cannot vectorize early exit loop",
+                               "EarlyExitNotLatchPredecessor", ORE, TheLoop);
+    return false;
+  }
+
+  // Check all instructions in the loop to see if they could potentially
+  // generate exceptions or have side-effects.
+  auto IsSafeOperation = [](Instruction *I) -> bool {
+    // Is this a divide?
+    switch (I->getOpcode()) {
+    case Instruction::Load:
+    case Instruction::Store:
+    case Instruction::PHI:
+    case Instruction::Br:
+      // These are checked separately. For example, canVectorizeMemory will
+      // analyze the loads and stores in the loop.
+      return true;
+    default:
+      return isSafeToSpeculativelyExecute(I);
+    }
+  };
+
+  for (auto *BB : TheLoop->blocks())
+    for (auto &I : *BB)
+      if (!IsSafeOperation(&I)) {
----------------
david-arm wrote:

I think you have to apply the restriction to all blocks up to and including the early exit too, because any operation in a vector lane after the one that triggers the exit could be unsafe. For example, see test `@loop_contains_unsafe_div`, where the early exit block looks like this:

```
loop:
  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
  %ld1 = load i32, ptr %arrayidx, align 1
  %div = udiv i32 20000, %ld1
  %cmp = icmp eq i32 %div, 1
  br i1 %cmp, label %loop.inc, label %loop.end
```

It could be lane 1 of the comparison that triggers the early exit, but lane 3 could still trap with a divide-by-zero, even though the scalar loop would have exited before reaching that iteration. If this were a normal loop where the divide was conditionally executed, i.e. something like

```
  for (int i = 0; i < n; i++) {
    if (mask[i]) {
      dst[i] = 20000 / src[i];
    }
  }
```

then the vectoriser can at least create a vector expression for the mask and select between `src` and a vector of ones to avoid unnecessarily creating an exception. However, in the test above the vectoriser has to perform the divide first in order to know what the mask should be.

I suppose that in a future improvement, if we can prove that the `udiv` can be safely moved after the vector comparison (i.e. because the comparison does not depend upon the `udiv`), then we might be able to create a mask and select between `src` and 1.

For now, I thought it best to avoid over-complicating the initial early exit vectorisation work by bailing out on edge cases like this.

https://github.com/llvm/llvm-project/pull/107004