[llvm] f5dd70c - [LSR] Require non-zero step when considering wrap around for term folding (#77809)

Thu Jan 11 10:07:21 PST 2024

Author: Philip Reames
Date: 2024-01-11T10:07:17-08:00
New Revision: f5dd70c58277d925710e5a7c25c86d7565cc3c6c

URL: https://github.com/llvm/llvm-project/commit/f5dd70c58277d925710e5a7c25c86d7565cc3c6c
DIFF: https://github.com/llvm/llvm-project/commit/f5dd70c58277d925710e5a7c25c86d7565cc3c6c.diff

LOG: [LSR] Require non-zero step when considering wrap around for term folding (#77809)

The term folding logic needs to prove that the induction variable does
not cycle through the same set of values so that testing for the value
of the IV on the exiting iteration is guaranteed to trigger only on that
iteration. The prior code checked the no-self-wrap property on the IV,
but this is insufficient as a zero step is trivially no-self-wrap per
SCEV's definition but does repeat the same series of values.

In the current form, this has the effect of basically disabling lsr's
term-folding for all non-constant strides. This is still a net
improvement as we've disabled term-folding entirely, so being able to
enable it for constant strides is still a net improvement.

As future work, there's two SCEV weakness worth investigating.

First sext (or i32 %a, 1) to i64 does not return true for
isKnownNonZero. This is because we check only the unsigned range in that
query. We could either do query pushdown, or check the signed range as
well. I tried the second locally and it has very broad impact - i.e. we
have a bunch of missing optimizations here.

Second, zext (or i32 %a, 1) to i64 as the increment to the IV in
expensive_expand_short_tc causes the addrec to no longer be provably
no-self-wrap. I didn't investigate this so it might be necessary, but
the loop structure is such that I find this result surprising.

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
    llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index a58bbe31856383..e496f79a8f8ce0 100644

--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -6816,7 +6816,8 @@ canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
     // iteration. The simplest case to consider is a candidate IV which is
     // narrower than the trip count (and thus original IV), but this can
     // also happen due to non-unit strides on the candidate IVs.
-    if (!AddRec->hasNoSelfWrap())
+    if (!AddRec->hasNoSelfWrap() ||
+        !SE.isKnownNonZero(AddRec->getStepRecurrence(SE)))
       continue;
 
     const SCEVAddRecExpr *PostInc = AddRec->getPostIncExpr(SE);

diff  --git a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll
index 0203abe69ac299..cb37f7b7bc3dbf 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll
@@ -474,28 +474,27 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
+; TODO: This case should be legal, but we run into a problem with SCEV's
+; ability to prove non-zero for sext expressions.
 define void @expensive_expand_short_tc(ptr %a, i32 %offset, i32 %n) {
 ; CHECK-LABEL: @expensive_expand_short_tc(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OFFSET_NONZERO:%.*]] = or i32 [[OFFSET:%.*]], 1
 ; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 84
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[OFFSET:%.*]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], 84
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[UGLYGEP]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    store i32 1, ptr [[LSR_IV1]], align 4
-; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET]]
-; CHECK-NEXT:    [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[SCEVGEP]]
-; CHECK-NEXT:    br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0:![0-9]+]]
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], 1
+; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET_NONZERO]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
 entry:
+  %offset.nonzero = or i32 %offset, 1
   %uglygep = getelementptr i8, ptr %a, i64 84
   br label %for.body
 
@@ -504,7 +503,7 @@ for.body:                                         ; preds = %for.body, %entry
   %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 0, %entry ]
   store i32 1, ptr %lsr.iv1, align 4
   %lsr.iv.next = add nsw i32 %lsr.iv, 1
-  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset
+  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset.nonzero
   %exitcond.not = icmp eq i32 %lsr.iv.next, %n
   br i1 %exitcond.not, label %for.end, label %for.body, !prof !{!"branch_weights", i32 1, i32 3}
 
@@ -512,28 +511,27 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
+; TODO: This case should be legal, but we run into a problem with SCEV's
+; ability to prove non-zero for sext expressions.
 define void @expensive_expand_long_tc(ptr %a, i32 %offset, i32 %n) {
 ; CHECK-LABEL: @expensive_expand_long_tc(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OFFSET_NONZERO:%.*]] = or i32 [[OFFSET:%.*]], 1
 ; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 84
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[OFFSET:%.*]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], 84
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[UGLYGEP]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    store i32 1, ptr [[LSR_IV1]], align 4
-; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET]]
-; CHECK-NEXT:    [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[SCEVGEP]]
-; CHECK-NEXT:    br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], 1
+; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET_NONZERO]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF1:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
 entry:
+  %offset.nonzero = or i32 %offset, 1
   %uglygep = getelementptr i8, ptr %a, i64 84
   br label %for.body
 
@@ -542,7 +540,7 @@ for.body:                                         ; preds = %for.body, %entry
   %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 0, %entry ]
   store i32 1, ptr %lsr.iv1, align 4
   %lsr.iv.next = add nsw i32 %lsr.iv, 1
-  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset
+  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset.nonzero
   %exitcond.not = icmp eq i32 %lsr.iv.next, %n
   br i1 %exitcond.not, label %for.end, label %for.body, !prof !{!"branch_weights", i32 1, i32 300}
 
@@ -550,28 +548,27 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
+; TODO: This case should be legal, but we run into a problem with SCEV's
+; ability to prove non-zero for sext expressions.
 define void @expensive_expand_unknown_tc(ptr %a, i32 %offset, i32 %n) {
 ; CHECK-LABEL: @expensive_expand_unknown_tc(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OFFSET_NONZERO:%.*]] = or i32 [[OFFSET:%.*]], 1
 ; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 84
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N:%.*]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[OFFSET:%.*]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], 84
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[UGLYGEP]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    store i32 1, ptr [[LSR_IV1]], align 4
-; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET]]
-; CHECK-NEXT:    [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[SCEVGEP]]
-; CHECK-NEXT:    br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], 1
+; CHECK-NEXT:    [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET_NONZERO]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
 entry:
+  %offset.nonzero = or i32 %offset, 1
   %uglygep = getelementptr i8, ptr %a, i64 84
   br label %for.body
 
@@ -580,7 +577,7 @@ for.body:                                         ; preds = %for.body, %entry
   %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 0, %entry ]
   store i32 1, ptr %lsr.iv1, align 4
   %lsr.iv.next = add nsw i32 %lsr.iv, 1
-  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset
+  %uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset.nonzero
   %exitcond.not = icmp eq i32 %lsr.iv.next, %n
   br i1 %exitcond.not, label %for.end, label %for.body