[llvm] [SCEV] Support scaled vscale as step in howFarToZero (PR #94411)

Tue Jun 4 16:17:39 PDT 2024

https://github.com/preames created https://github.com/llvm/llvm-project/pull/94411

VF * vscale is the canonical step for a scalably vectorized loop, and LFTR canonicalizes to NE loop tests, so having our trip count logic be unable to compute trip counts for such loops is unfortunate.

The existing code needed minimal generalization to handle non-constant strides.  The tricky cases to be sure we handle correctly are: zero, and -1 (due to the special case of abs(-1) being non-positive).

This patch does the full generalization in terms of code structure, but only lets it kick in when the non-constant is a (C * vscale) node.  I did some quick investigation, and it seems the context-free non-zero and sign checks are basically never disproved for arbitrary scales. I think we have alternate tactics available for these, but I'm going to return to that in a separate patch.

>From fcbdf949e996050da67fbd7a8706a7ee06d547b7 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Tue, 4 Jun 2024 16:02:48 -0700
Subject: [PATCH] [SCEV] Support scaled vscale as step in howFarToZero

VF * vscale is the canonical step for a scalably vectorized loop, and
LFTR canonicalizes to NE loop tests, so having our trip count logic be
unable to compute trip counts for such loops is unfortunate.

The existing code needed minimal generalization to handle non-constant
strides.  The tricky cases to be sure we handle correctly are: zero,
and -1 (due to the special case of abs(-1) being non-positive).

This patch does the full generalization in terms of code structure, but
only lets it kick in when the non-constant is a (C * vscale) node.  I
did some quick investigation, and it seems the context-free non-zero
and sign checks are basically never disproved for arbitrary scales.
I think we have alternate tactics available for these, but I'm going
to return to that in a separate patch.
---
 llvm/lib/Analysis/ScalarEvolution.cpp         | 38 +++++++++++++------
 .../ScalarEvolution/scalable-vector.ll        | 11 +++---
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 3b9aa9ab623f8..e0e0b610993fa 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10422,6 +10422,17 @@ SolveQuadraticAddRecRange(const SCEVAddRecExpr *AddRec,
   return TruncIfPossible(MinOptional(SL.first, SU.first), BitWidth);
 }
 
+/// Return true if S is a two-operand SCEVMulExpr of the form (C * vscale), the
+/// canonical step of scalably vectorized loops, where C is a SCEVConstant.
+static bool matchScaledVScale(const SCEV *S) {
+  const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S);
+  if (!Mul || Mul->getNumOperands() != 2)
+    return false;
+
+  return isa<SCEVConstant>(Mul->getOperand(0)) &&
+         Mul->getOperand(1)->getSCEVType() == scVScale;
+}
+
 ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V,
                                                          const Loop *L,
                                                          bool ControlsOnlyExit,
@@ -10483,15 +10494,15 @@ ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V,
   // Get the initial value for the loop.
   const SCEV *Start = getSCEVAtScope(AddRec->getStart(), L->getParentLoop());
   const SCEV *Step = getSCEVAtScope(AddRec->getOperand(1), L->getParentLoop());
-
-  // For now we handle only constant steps.
-  //
-  // TODO: Handle a nonconstant Step given AddRec<NUW>. If the
-  // AddRec is NUW, then (in an unsigned sense) it cannot be counting up to wrap
-  // to 0, it must be counting down to equal 0. Consequently, N = Start / -Step.
-  // We have not yet seen any such cases.
   const SCEVConstant *StepC = dyn_cast<SCEVConstant>(Step);
-  if (!StepC || StepC->getValue()->isZero())
+  // The code below is correct for arbitrary non-constant steps, but we won't be
+  // able to prove useful properties given the lack of use-dependent reasoning
+  // here.  To avoid spending compile time for no value, bail early unless the
+  // step is a "useful" non-constant value.
+  if (!StepC && !matchScaledVScale(Step))
+    return getCouldNotCompute();
+
+  if (!isLoopInvariant(Step, L) || !isKnownNonZero(Step))
     return getCouldNotCompute();
 
   // For positive steps (counting up until unsigned overflow):
@@ -10499,13 +10510,16 @@ ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V,
   // For negative steps (counting down to zero):
   //   N = Start/-Step
   // First compute the unsigned distance from zero in the direction of Step.
-  bool CountDown = StepC->getAPInt().isNegative();
-  const SCEV *Distance = CountDown ? Start : getNegativeSCEV(Start);
+  bool CountDown = isKnownNegative(Step);
+  if (!CountDown && !isKnownNonNegative(Step))
+    return getCouldNotCompute();
 
+  const SCEV *Distance = CountDown ? Start : getNegativeSCEV(Start);
   // Handle unitary steps, which cannot wraparound.
   // 1*N = -Start; -1*N = Start (mod 2^BW), so:
   //   N = Distance (as unsigned)
-  if (StepC->getValue()->isOne() || StepC->getValue()->isMinusOne()) {
+  if (StepC &&
+      (StepC->getValue()->isOne() || StepC->getValue()->isMinusOne())) {
     APInt MaxBECount = getUnsignedRangeMax(applyLoopGuards(Distance, L));
     MaxBECount = APIntOps::umin(MaxBECount, getUnsignedRangeMax(Distance));
 
@@ -10550,6 +10564,8 @@ ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V,
   }
 
   // Solve the general equation.
+  if (!StepC)
+    return getCouldNotCompute();
   const SCEV *E = SolveLinEquationWithOverflow(StepC->getAPInt(),
                                                getNegativeSCEV(Start), *this);
 
diff --git a/llvm/test/Analysis/ScalarEvolution/scalable-vector.ll b/llvm/test/Analysis/ScalarEvolution/scalable-vector.ll
index 81434d1bf064c..0a3ec4d66301b 100644
--- a/llvm/test/Analysis/ScalarEvolution/scalable-vector.ll
+++ b/llvm/test/Analysis/ScalarEvolution/scalable-vector.ll
@@ -91,13 +91,14 @@ define void @vscale_step_ne_tripcount(i64 %N) vscale_range(2, 1024) {
 ; CHECK-NEXT:    %n.vec = sub i64 %n.rnd.up, %n.mod.vf
 ; CHECK-NEXT:    --> (4 * vscale * ((-1 + (4 * vscale)<nuw><nsw> + %N) /u (4 * vscale)<nuw><nsw>)) U: [0,-3) S: [-9223372036854775808,9223372036854775805)
 ; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK-NEXT:    --> {0,+,(4 * vscale)<nuw><nsw>}<nuw><%vector.body> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: <<Unknown>> LoopDispositions: { %vector.body: Computable }
+; CHECK-NEXT:    --> {0,+,(4 * vscale)<nuw><nsw>}<nuw><%vector.body> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: (4 * vscale * ((-1 * vscale * (4 + (-4 * ((-1 + (4 * vscale)<nuw><nsw> + %N) /u (4 * vscale)<nuw><nsw>))<nsw>)<nsw>) /u (4 * vscale)<nuw><nsw>)) LoopDispositions: { %vector.body: Computable }
 ; CHECK-NEXT:    %index.next = add nuw i64 %index, %2
-; CHECK-NEXT:    --> {(4 * vscale)<nuw><nsw>,+,(4 * vscale)<nuw><nsw>}<nuw><%vector.body> U: [8,-3) S: [-9223372036854775808,9223372036854775805) Exits: <<Unknown>> LoopDispositions: { %vector.body: Computable }
+; CHECK-NEXT:    --> {(4 * vscale)<nuw><nsw>,+,(4 * vscale)<nuw><nsw>}<nuw><%vector.body> U: [8,-3) S: [-9223372036854775808,9223372036854775805) Exits: (vscale * (4 + (4 * ((-1 * vscale * (4 + (-4 * ((-1 + (4 * vscale)<nuw><nsw> + %N) /u (4 * vscale)<nuw><nsw>))<nsw>)<nsw>) /u (4 * vscale)<nuw><nsw>))<nuw><nsw>)<nuw>) LoopDispositions: { %vector.body: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @vscale_step_ne_tripcount
-; CHECK-NEXT:  Loop %vector.body: Unpredictable backedge-taken count.
-; CHECK-NEXT:  Loop %vector.body: Unpredictable constant max backedge-taken count.
-; CHECK-NEXT:  Loop %vector.body: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT:  Loop %vector.body: backedge-taken count is ((-1 * vscale * (4 + (-4 * ((-1 + (4 * vscale)<nuw><nsw> + %N) /u (4 * vscale)<nuw><nsw>))<nsw>)<nsw>) /u (4 * vscale)<nuw><nsw>)
+; CHECK-NEXT:  Loop %vector.body: constant max backedge-taken count is i64 2305843009213693951
+; CHECK-NEXT:  Loop %vector.body: symbolic max backedge-taken count is ((-1 * vscale * (4 + (-4 * ((-1 + (4 * vscale)<nuw><nsw> + %N) /u (4 * vscale)<nuw><nsw>))<nsw>)<nsw>) /u (4 * vscale)<nuw><nsw>)
+; CHECK-NEXT:  Loop %vector.body: Trip multiple is 1
 ;
 entry:
   %0 = sub i64 -1, %N