[llvm] [LSR] Require non-zero step when considering wrap around for term fol… (PR #77809)

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 11 09:57:45 PST 2024


https://github.com/preames created https://github.com/llvm/llvm-project/pull/77809

…ding

The term folding logic needs to prove that the induction variable does not cycle through the same set of values, so that testing the IV's value on the exiting iteration is guaranteed to trigger only on that iteration. The prior code checked the no-self-wrap property on the IV, but that is insufficient: a zero step is trivially no-self-wrap by SCEV's definition, yet it repeats the same series of values.
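
To make the failure mode concrete, here is a minimal C++ sketch (not taken from the patch; names and shapes are illustrative) of the rewrite term folding performs: the integer exit test is replaced by a comparison of a candidate IV against its precomputed exit value.

#include <cstdint>

// Original loop: the exit test is on the counting IV.
void original(char *base, int64_t step, int64_t n) {
  char *p = base;
  for (int64_t i = 0; i != n; ++i) {
    *p = 1;
    p += step;
  }
}

// Term-folded form: the exit test is on the pointer IV instead.
void term_folded(char *base, int64_t step, int64_t n) {
  char *end = base + n * step; // exit value of p, expanded up front
  for (char *p = base; p != end; p += step)
    *p = 1;
}

With step == 0 the pointer IV repeats the same value on every iteration: end == base, so term_folded exits before the first store while original runs n iterations. That zero-step addrec is still trivially no-self-wrap, which is why hasNoSelfWrap() alone is an insufficient guard and the patch additionally requires a known non-zero step.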

In its current form, this check has the effect of essentially disabling LSR's term-folding for all non-constant strides. Since term-folding is currently disabled entirely, being able to enable it for constant strides is still a net improvement.

As future work, there are two SCEV weaknesses worth investigating.

First, sext (or i32 %a, 1) to i64 does not return true for isKnownNonZero. This is because that query checks only the unsigned range. We could either push the query down through the sext, or check the signed range as well. I tried the second locally and it has very broad impact; i.e., we have a bunch of missing optimizations here.
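
For reference, here is a minimal sketch of the second option, assuming the standard ScalarEvolution range APIs (getUnsignedRange/getSignedRange); this is hypothetical, not the in-tree query:

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// Hypothetical variant: a SCEV is known non-zero if either its unsigned
// or its signed range excludes zero. Per the note above, for
// sext (or i32 %a, 1) to i64 the signed range excludes zero even though
// the unsigned range (the only one consulted today) does not.
static bool isKnownNonZeroEitherRange(ScalarEvolution &SE, const SCEV *S) {
  APInt Zero = APInt::getZero(SE.getTypeSizeInBits(S->getType()));
  return !SE.getUnsignedRange(S).contains(Zero) ||
         !SE.getSignedRange(S).contains(Zero);
}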

Second, zext (or i32 %a, 1) to i64 as the increment to the IV in expensive_expand_short_tc causes the addrec to no longer be provably no-self-wrap. I didn't investigate this, so the conservatism may be necessary, but the loop structure is such that I find this result surprising.
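
For context, the loop in question has roughly the following shape (a C++ paraphrase of the IR in expensive_expand_short_tc; the value names come from the test, the translation is mine). The pointer IV steps by a provably non-zero value and a separate 32-bit counter bounds the trip count, which is why the missing no-self-wrap proof is surprising:

void expensive_expand_short_tc(char *a, int offset, int n) {
  int offset_nonzero = offset | 1;   // %offset.nonzero: low bit set, non-zero
  char *p = a + 84;                  // %uglygep
  int i = 0;                         // %lsr.iv
  do {
    *reinterpret_cast<int *>(p) = 1; // store i32 1, ptr %lsr.iv1
    i += 1;                          // %lsr.iv.next (nsw)
    p += offset_nonzero;             // %uglygep2
  } while (i != n);                  // %exitcond.not
}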

From 56e6e3d01fdab2e4bc832bb797e51130363d7bb4 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 11 Jan 2024 08:58:14 -0800
Subject: [PATCH] [LSR] Require non-zero step when considering wrap around for
 term folding

The term folding logic needs to prove that the induction variable does
not cycle through the same set of values, so that testing the IV's value
on the exiting iteration is guaranteed to trigger only on that
iteration. The prior code checked the no-self-wrap property on the IV,
but that is insufficient: a zero step is trivially no-self-wrap by
SCEV's definition, yet it repeats the same series of values.

In its current form, this check has the effect of essentially disabling
LSR's term-folding for all non-constant strides. Since term-folding is
currently disabled entirely, being able to enable it for constant
strides is still a net improvement.

As future work, there are two SCEV weaknesses worth investigating.

First, sext (or i32 %a, 1) to i64 does not return true for
isKnownNonZero. This is because that query checks only the unsigned
range. We could either push the query down through the sext, or check
the signed range as well. I tried the second locally and it has very
broad impact; i.e., we have a bunch of missing optimizations here.

Second, zext (or i32 %a, 1) to i64 as the increment to the IV in
expensive_expand_short_tc causes the addrec to no longer be provably
no-self-wrap. I didn't investigate this, so the conservatism may be
necessary, but the loop structure is such that I find this result
surprising.
---
 .../Transforms/Scalar/LoopStrengthReduce.cpp  |  3 +-
 .../LoopStrengthReduce/lsr-term-fold.ll       | 63 +++++++++----------
 2 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index a58bbe31856383..e496f79a8f8ce0 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -6816,7 +6816,8 @@ canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
     // iteration. The simplest case to consider is a candidate IV which is
     // narrower than the trip count (and thus original IV), but this can
     // also happen due to non-unit strides on the candidate IVs.
-    if (!AddRec->hasNoSelfWrap())
+    if (!AddRec->hasNoSelfWrap() ||
+        !SE.isKnownNonZero(AddRec->getStepRecurrence(SE)))
       continue;
 
     const SCEVAddRecExpr *PostInc = AddRec->getPostIncExpr(SE);
diff --git a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll
index 0203abe69ac299..cb37f7b7bc3dbf 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll
@@ -474,28 +474,27 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
+; TODO: This case should be legal, but we run into a problem with SCEV's
+; ability to prove non-zero for sext expressions.
 define void @expensive_expand_short_tc(ptr %a, i32 %offset, i32 %n) {
 ; CHECK-LABEL: @expensive_expand_short_tc(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OFFSET_NONZERO:%.*]] = or i32 [[OFFSET:%.*]], 1
 ; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 84
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[OFFSET:%.*]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], 84
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[UGLYGEP]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    store i32 1, ptr [[LSR_IV1]], align 4
-; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET]]
-; CHECK-NEXT:    [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[SCEVGEP]]
-; CHECK-NEXT:    br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0:![0-9]+]]
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], 1
+; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET_NONZERO]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
 entry:
+  %offset.nonzero = or i32 %offset, 1
   %uglygep = getelementptr i8, ptr %a, i64 84
   br label %for.body
 
@@ -504,7 +503,7 @@ for.body:                                         ; preds = %for.body, %entry
   %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 0, %entry ]
   store i32 1, ptr %lsr.iv1, align 4
   %lsr.iv.next = add nsw i32 %lsr.iv, 1
-  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset
+  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset.nonzero
   %exitcond.not = icmp eq i32 %lsr.iv.next, %n
   br i1 %exitcond.not, label %for.end, label %for.body, !prof !{!"branch_weights", i32 1, i32 3}
 
@@ -512,28 +511,27 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
+; TODO: This case should be legal, but we run into a problem with SCEV's
+; ability to prove non-zero for sext expressions.
 define void @expensive_expand_long_tc(ptr %a, i32 %offset, i32 %n) {
 ; CHECK-LABEL: @expensive_expand_long_tc(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OFFSET_NONZERO:%.*]] = or i32 [[OFFSET:%.*]], 1
 ; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 84
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[OFFSET:%.*]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], 84
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[UGLYGEP]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    store i32 1, ptr [[LSR_IV1]], align 4
-; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET]]
-; CHECK-NEXT:    [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[SCEVGEP]]
-; CHECK-NEXT:    br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], 1
+; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET_NONZERO]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF1:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
 entry:
+  %offset.nonzero = or i32 %offset, 1
   %uglygep = getelementptr i8, ptr %a, i64 84
   br label %for.body
 
@@ -542,7 +540,7 @@ for.body:                                         ; preds = %for.body, %entry
   %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 0, %entry ]
   store i32 1, ptr %lsr.iv1, align 4
   %lsr.iv.next = add nsw i32 %lsr.iv, 1
-  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset
+  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset.nonzero
   %exitcond.not = icmp eq i32 %lsr.iv.next, %n
   br i1 %exitcond.not, label %for.end, label %for.body, !prof !{!"branch_weights", i32 1, i32 300}
 
@@ -550,28 +548,27 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
+; TODO: This case should be legal, but we run into a problem with SCEV's
+; ability to prove non-zero for sext expressions.
 define void @expensive_expand_unknown_tc(ptr %a, i32 %offset, i32 %n) {
 ; CHECK-LABEL: @expensive_expand_unknown_tc(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OFFSET_NONZERO:%.*]] = or i32 [[OFFSET:%.*]], 1
 ; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 84
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[OFFSET:%.*]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], 84
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[UGLYGEP]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    store i32 1, ptr [[LSR_IV1]], align 4
-; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET]]
-; CHECK-NEXT:    [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[SCEVGEP]]
-; CHECK-NEXT:    br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], 1
+; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET_NONZERO]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
 entry:
+  %offset.nonzero = or i32 %offset, 1
   %uglygep = getelementptr i8, ptr %a, i64 84
   br label %for.body
 
@@ -580,7 +577,7 @@ for.body:                                         ; preds = %for.body, %entry
   %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 0, %entry ]
   store i32 1, ptr %lsr.iv1, align 4
   %lsr.iv.next = add nsw i32 %lsr.iv, 1
-  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset
+  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset.nonzero
   %exitcond.not = icmp eq i32 %lsr.iv.next, %n
   br i1 %exitcond.not, label %for.end, label %for.body
 


