[llvm] [LoopPeel] Support min/max intrinsics in loop peeling (PR #93162)

Sergey Kachkov via llvm-commits llvm-commits at lists.llvm.org
Wed May 29 02:43:54 PDT 2024


https://github.com/skachkov-sc updated https://github.com/llvm/llvm-project/pull/93162

>From f8107dfa8d18e1b35e2061be888dde5e82a735dd Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Thu, 16 May 2024 18:28:04 +0300
Subject: [PATCH 1/5] [LoopPeel] Add pre-commit test for min/max intrinsics

---
 .../peel-loop-min-max-intrinsics.ll           | 229 ++++++++++++++++++
 1 file changed, 229 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll

diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll
new file mode 100644
index 0000000000000..2a340790fc448
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -S -passes=loop-unroll -unroll-peel-max-count=2 | FileCheck %s
+
+declare void @foo(i32)
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+
+declare void @bar(i8)
+
+declare i8 @llvm.umin.i8(i8, i8)
+
+define void @test1(i32 %N) {
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP5_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP5_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COND:%.*]] = tail call i32 @llvm.umin.i32(i32 [[I_06]], i32 2)
+; CHECK-NEXT:    tail call void @foo(i32 [[COND]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_06]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp5.not = icmp eq i32 %N, 0
+  br i1 %cmp5.not, label %for.cond.cleanup, label %for.body
+
+for.body:
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %cond = tail call i32 @llvm.umin.i32(i32 %i.06, i32 2)
+  tail call void @foo(i32 %cond)
+  %inc = add nuw i32 %i.06, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @test2(i32 %N) {
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP5_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP5_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COND:%.*]] = tail call i32 @llvm.umax.i32(i32 [[I_06]], i32 2)
+; CHECK-NEXT:    tail call void @foo(i32 [[COND]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_06]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp5.not = icmp eq i32 %N, 0
+  br i1 %cmp5.not, label %for.cond.cleanup, label %for.body
+
+for.body:
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %cond = tail call i32 @llvm.umax.i32(i32 %i.06, i32 2)
+  tail call void @foo(i32 %cond)
+  %inc = add nuw i32 %i.06, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @test3(i32 %N) {
+; CHECK-LABEL: define void @test3(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP5:%.*]] = icmp slt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COND:%.*]] = tail call i32 @llvm.smax.i32(i32 [[I_06]], i32 -2)
+; CHECK-NEXT:    tail call void @foo(i32 [[COND]])
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[I_06]], -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[DEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp5 = icmp slt i32 %N, 0
+  br i1 %cmp5, label %for.body, label %for.cond.cleanup
+
+for.body:
+  %i.06 = phi i32 [ %dec, %for.body ], [ 0, %entry ]
+  %cond = tail call i32 @llvm.smax.i32(i32 %i.06, i32 -2)
+  tail call void @foo(i32 %cond)
+  %dec = add nsw i32 %i.06, -1
+  %cmp = icmp sgt i32 %dec, %N
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @test4(i32 %N) {
+; CHECK-LABEL: define void @test4(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP5:%.*]] = icmp slt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COND:%.*]] = tail call i32 @llvm.smin.i32(i32 [[I_06]], i32 -2)
+; CHECK-NEXT:    tail call void @foo(i32 noundef signext [[COND]])
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[I_06]], -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[DEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp5 = icmp slt i32 %N, 0
+  br i1 %cmp5, label %for.body, label %for.cond.cleanup
+
+for.body:
+  %i.06 = phi i32 [ %dec, %for.body ], [ 0, %entry ]
+  %cond = tail call i32 @llvm.smin.i32(i32 %i.06, i32 -2)
+  tail call void @foo(i32 noundef signext %cond)
+  %dec = add nsw i32 %i.06, -1
+  %cmp = icmp sgt i32 %dec, %N
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @test_negative(i32 %End, i32 %Step) {
+; CHECK-LABEL: define void @test_negative(
+; CHECK-SAME: i32 [[END:%.*]], i32 [[STEP:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT5:%.*]] = icmp eq i32 [[END]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT5]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COND:%.*]] = tail call i32 @llvm.smin.i32(i32 [[I_06]], i32 2)
+; CHECK-NEXT:    tail call void @foo(i32 [[COND]])
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[I_06]], [[STEP]]
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[ADD]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp.not5 = icmp eq i32 %End, 0
+  br i1 %cmp.not5, label %for.cond.cleanup, label %for.body
+
+for.body:
+  %i.06 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %cond = tail call i32 @llvm.smin.i32(i32 %i.06, i32 2)
+  tail call void @foo(i32 %cond)
+  %add = add nsw i32 %i.06, %Step
+  %cmp.not = icmp eq i32 %add, %End
+  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @test_wrap(i8 %N) {
+; CHECK-LABEL: define void @test_wrap(
+; CHECK-SAME: i8 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP5_NOT:%.*]] = icmp eq i8 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP5_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_06:%.*]] = phi i8 [ [[INC1:%.*]], [[FOR_BODY1]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COND1:%.*]] = tail call i8 @llvm.umin.i8(i8 [[I_06]], i8 -2)
+; CHECK-NEXT:    tail call void @bar(i8 [[COND1]])
+; CHECK-NEXT:    [[INC1]] = add i8 [[I_06]], 127
+; CHECK-NEXT:    [[EXITCOND_NOT1:%.*]] = icmp eq i8 [[INC1]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT1]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY1]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp5.not = icmp eq i8 %N, 0
+  br i1 %cmp5.not, label %for.cond.cleanup, label %for.body
+
+for.body:
+  %i.06 = phi i8 [ %inc, %for.body ], [ 0, %entry ]
+  %cond = tail call i8 @llvm.umin.i8(i8 %i.06, i8 254)
+  tail call void @bar(i8 %cond)
+  %inc = add i8 %i.06, 127
+  %exitcond.not = icmp eq i8 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}

>From 513cb4d457e20ec9114c645c2f02bf66df9dfa39 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Thu, 16 May 2024 18:34:52 +0300
Subject: [PATCH 2/5] [LoopPeel] Support min/max intrinsics in loop peeling

---
 llvm/lib/Transforms/Utils/LoopPeel.cpp        |  49 ++++++
 .../peel-loop-min-max-intrinsics.ll           | 164 +++++++++++++++---
 2 files changed, 193 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index f76fa3bb6c611..a6d37b9bdf11d 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -449,10 +449,59 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);
   };
 
+  auto ComputePeelCountMinMax = [&](IntrinsicInst *II) {
+    bool IsSigned;
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::smax:
+    case Intrinsic::smin:
+      IsSigned = true;
+      break;
+    case Intrinsic::umax:
+    case Intrinsic::umin:
+      IsSigned = false;
+      break;
+    default:
+      return;
+    }
+    Value *LHS = II->getOperand(0), *RHS = II->getOperand(1);
+    const SCEV *BoundSCEV, *IterSCEV;
+    if (L.isLoopInvariant(LHS)) {
+      BoundSCEV = SE.getSCEV(LHS);
+      IterSCEV = SE.getSCEV(RHS);
+    } else if (L.isLoopInvariant(RHS)) {
+      BoundSCEV = SE.getSCEV(RHS);
+      IterSCEV = SE.getSCEV(LHS);
+    } else
+      return;
+    const auto *AddRec = dyn_cast<SCEVAddRecExpr>(IterSCEV);
+    // For simplicity, we support only affine recurrences.
+    if (!AddRec || !AddRec->isAffine() || AddRec->getLoop() != &L)
+      return;
+    const SCEV *Step = AddRec->getStepRecurrence(SE);
+    // To minimize number of peeled iterations, we use strict relational
+    // predicates here.
+    ICmpInst::Predicate Pred;
+    if (SE.isKnownPositive(Step))
+      Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+    else if (SE.isKnownNegative(Step))
+      Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+    else
+      return;
+    const SCEV *IterVal = AddRec->evaluateAtIteration(
+        SE.getConstant(AddRec->getType(), DesiredPeelCount), SE);
+    while (DesiredPeelCount < MaxPeelCount &&
+           SE.isKnownPredicate(Pred, IterVal, BoundSCEV)) {
+      IterVal = SE.getAddExpr(IterVal, Step);
+      ++DesiredPeelCount;
+    }
+  };
+
   for (BasicBlock *BB : L.blocks()) {
     for (Instruction &I : *BB) {
       if (SelectInst *SI = dyn_cast<SelectInst>(&I))
         ComputePeelCount(SI->getCondition(), 0);
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
+        ComputePeelCountMinMax(II);
     }
 
     auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll
index 2a340790fc448..b990067e398dc 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll
@@ -19,14 +19,37 @@ define void @test1(i32 %N) {
 ; CHECK-NEXT:    [[CMP5_NOT:%.*]] = icmp eq i32 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP5_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL_BEGIN:%.*]]
+; CHECK:       for.body.peel.begin:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL:%.*]]
+; CHECK:       for.body.peel:
+; CHECK-NEXT:    [[COND_PEEL:%.*]] = tail call i32 @llvm.umin.i32(i32 0, i32 2)
+; CHECK-NEXT:    tail call void @foo(i32 [[COND_PEEL]])
+; CHECK-NEXT:    [[INC_PEEL:%.*]] = add nuw i32 0, 1
+; CHECK-NEXT:    [[EXITCOND_NOT_PEEL:%.*]] = icmp eq i32 [[INC_PEEL]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_PEEL]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_PEEL_NEXT:%.*]]
+; CHECK:       for.body.peel.next:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL2:%.*]]
+; CHECK:       for.body.peel2:
+; CHECK-NEXT:    [[COND_PEEL3:%.*]] = tail call i32 @llvm.umin.i32(i32 [[INC_PEEL]], i32 2)
+; CHECK-NEXT:    tail call void @foo(i32 [[COND_PEEL3]])
+; CHECK-NEXT:    [[INC_PEEL4:%.*]] = add nuw i32 [[INC_PEEL]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_PEEL5:%.*]] = icmp eq i32 [[INC_PEEL4]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_PEEL5]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY_PEEL_NEXT1:%.*]]
+; CHECK:       for.body.peel.next1:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL_NEXT6:%.*]]
+; CHECK:       for.body.peel.next6:
+; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER_PEEL_NEWPH:%.*]]
+; CHECK:       for.body.preheader.peel.newph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[COND:%.*]] = tail call i32 @llvm.umin.i32(i32 [[I_06]], i32 2)
-; CHECK-NEXT:    tail call void @foo(i32 [[COND]])
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[INC_PEEL4]], [[FOR_BODY_PREHEADER_PEEL_NEWPH]] ]
+; CHECK-NEXT:    tail call void @foo(i32 2)
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_06]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
@@ -55,14 +78,37 @@ define void @test2(i32 %N) {
 ; CHECK-NEXT:    [[CMP5_NOT:%.*]] = icmp eq i32 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP5_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL_BEGIN:%.*]]
+; CHECK:       for.body.peel.begin:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL:%.*]]
+; CHECK:       for.body.peel:
+; CHECK-NEXT:    [[COND_PEEL:%.*]] = tail call i32 @llvm.umax.i32(i32 0, i32 2)
+; CHECK-NEXT:    tail call void @foo(i32 [[COND_PEEL]])
+; CHECK-NEXT:    [[INC_PEEL:%.*]] = add nuw i32 0, 1
+; CHECK-NEXT:    [[EXITCOND_NOT_PEEL:%.*]] = icmp eq i32 [[INC_PEEL]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_PEEL]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_PEEL_NEXT:%.*]]
+; CHECK:       for.body.peel.next:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL2:%.*]]
+; CHECK:       for.body.peel2:
+; CHECK-NEXT:    [[COND_PEEL3:%.*]] = tail call i32 @llvm.umax.i32(i32 [[INC_PEEL]], i32 2)
+; CHECK-NEXT:    tail call void @foo(i32 [[COND_PEEL3]])
+; CHECK-NEXT:    [[INC_PEEL4:%.*]] = add nuw i32 [[INC_PEEL]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_PEEL5:%.*]] = icmp eq i32 [[INC_PEEL4]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_PEEL5]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY_PEEL_NEXT1:%.*]]
+; CHECK:       for.body.peel.next1:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL_NEXT6:%.*]]
+; CHECK:       for.body.peel.next6:
+; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER_PEEL_NEWPH:%.*]]
+; CHECK:       for.body.preheader.peel.newph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[COND:%.*]] = tail call i32 @llvm.umax.i32(i32 [[I_06]], i32 2)
-; CHECK-NEXT:    tail call void @foo(i32 [[COND]])
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[INC_PEEL4]], [[FOR_BODY_PREHEADER_PEEL_NEWPH]] ]
+; CHECK-NEXT:    tail call void @foo(i32 [[I_06]])
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_06]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
@@ -91,14 +137,37 @@ define void @test3(i32 %N) {
 ; CHECK-NEXT:    [[CMP5:%.*]] = icmp slt i32 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL_BEGIN:%.*]]
+; CHECK:       for.body.peel.begin:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL:%.*]]
+; CHECK:       for.body.peel:
+; CHECK-NEXT:    [[COND_PEEL:%.*]] = tail call i32 @llvm.smax.i32(i32 0, i32 -2)
+; CHECK-NEXT:    tail call void @foo(i32 [[COND_PEEL]])
+; CHECK-NEXT:    [[DEC_PEEL:%.*]] = add nsw i32 0, -1
+; CHECK-NEXT:    [[CMP_PEEL:%.*]] = icmp sgt i32 [[DEC_PEEL]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_PEEL]], label [[FOR_BODY_PEEL_NEXT:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.body.peel.next:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL2:%.*]]
+; CHECK:       for.body.peel2:
+; CHECK-NEXT:    [[COND_PEEL3:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DEC_PEEL]], i32 -2)
+; CHECK-NEXT:    tail call void @foo(i32 [[COND_PEEL3]])
+; CHECK-NEXT:    [[DEC_PEEL4:%.*]] = add nsw i32 [[DEC_PEEL]], -1
+; CHECK-NEXT:    [[CMP_PEEL5:%.*]] = icmp sgt i32 [[DEC_PEEL4]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_PEEL5]], label [[FOR_BODY_PEEL_NEXT1:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       for.body.peel.next1:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL_NEXT6:%.*]]
+; CHECK:       for.body.peel.next6:
+; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER_PEEL_NEWPH:%.*]]
+; CHECK:       for.body.preheader.peel.newph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[COND:%.*]] = tail call i32 @llvm.smax.i32(i32 [[I_06]], i32 -2)
-; CHECK-NEXT:    tail call void @foo(i32 [[COND]])
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_BODY]] ], [ [[DEC_PEEL4]], [[FOR_BODY_PREHEADER_PEEL_NEWPH]] ]
+; CHECK-NEXT:    tail call void @foo(i32 -2)
 ; CHECK-NEXT:    [[DEC]] = add nsw i32 [[I_06]], -1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[DEC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
@@ -127,14 +196,37 @@ define void @test4(i32 %N) {
 ; CHECK-NEXT:    [[CMP5:%.*]] = icmp slt i32 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL_BEGIN:%.*]]
+; CHECK:       for.body.peel.begin:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL:%.*]]
+; CHECK:       for.body.peel:
+; CHECK-NEXT:    [[COND_PEEL:%.*]] = tail call i32 @llvm.smin.i32(i32 0, i32 -2)
+; CHECK-NEXT:    tail call void @foo(i32 noundef signext [[COND_PEEL]])
+; CHECK-NEXT:    [[DEC_PEEL:%.*]] = add nsw i32 0, -1
+; CHECK-NEXT:    [[CMP_PEEL:%.*]] = icmp sgt i32 [[DEC_PEEL]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_PEEL]], label [[FOR_BODY_PEEL_NEXT:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.body.peel.next:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL2:%.*]]
+; CHECK:       for.body.peel2:
+; CHECK-NEXT:    [[COND_PEEL3:%.*]] = tail call i32 @llvm.smin.i32(i32 [[DEC_PEEL]], i32 -2)
+; CHECK-NEXT:    tail call void @foo(i32 noundef signext [[COND_PEEL3]])
+; CHECK-NEXT:    [[DEC_PEEL4:%.*]] = add nsw i32 [[DEC_PEEL]], -1
+; CHECK-NEXT:    [[CMP_PEEL5:%.*]] = icmp sgt i32 [[DEC_PEEL4]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_PEEL5]], label [[FOR_BODY_PEEL_NEXT1:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       for.body.peel.next1:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL_NEXT6:%.*]]
+; CHECK:       for.body.peel.next6:
+; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER_PEEL_NEWPH:%.*]]
+; CHECK:       for.body.preheader.peel.newph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[COND:%.*]] = tail call i32 @llvm.smin.i32(i32 [[I_06]], i32 -2)
-; CHECK-NEXT:    tail call void @foo(i32 noundef signext [[COND]])
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_BODY]] ], [ [[DEC_PEEL4]], [[FOR_BODY_PREHEADER_PEEL_NEWPH]] ]
+; CHECK-NEXT:    tail call void @foo(i32 noundef signext [[I_06]])
 ; CHECK-NEXT:    [[DEC]] = add nsw i32 [[I_06]], -1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[DEC]], [[N]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
@@ -200,13 +292,37 @@ define void @test_wrap(i8 %N) {
 ; CHECK-NEXT:    br i1 [[CMP5_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_06:%.*]] = phi i8 [ [[INC1:%.*]], [[FOR_BODY1]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK:       for.body.peel.begin:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL:%.*]]
+; CHECK:       for.body.peel:
+; CHECK-NEXT:    [[COND_PEEL:%.*]] = tail call i8 @llvm.umin.i8(i8 0, i8 -2)
+; CHECK-NEXT:    tail call void @bar(i8 [[COND_PEEL]])
+; CHECK-NEXT:    [[I_06:%.*]] = add i8 0, 127
+; CHECK-NEXT:    [[EXITCOND_NOT_PEEL:%.*]] = icmp eq i8 [[I_06]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_PEEL]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_PEEL_NEXT:%.*]]
+; CHECK:       for.body.peel.next:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL2:%.*]]
+; CHECK:       for.body.peel2:
 ; CHECK-NEXT:    [[COND1:%.*]] = tail call i8 @llvm.umin.i8(i8 [[I_06]], i8 -2)
 ; CHECK-NEXT:    tail call void @bar(i8 [[COND1]])
-; CHECK-NEXT:    [[INC1]] = add i8 [[I_06]], 127
+; CHECK-NEXT:    [[INC1:%.*]] = add i8 [[I_06]], 127
 ; CHECK-NEXT:    [[EXITCOND_NOT1:%.*]] = icmp eq i8 [[INC1]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT1]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY1]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT1]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY_PEEL_NEXT1:%.*]]
+; CHECK:       for.body.peel.next1:
+; CHECK-NEXT:    br label [[FOR_BODY_PEEL_NEXT6:%.*]]
+; CHECK:       for.body.peel.next6:
+; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER_PEEL_NEWPH:%.*]]
+; CHECK:       for.body.preheader.peel.newph:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_6:%.*]] = phi i8 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[INC1]], [[FOR_BODY_PREHEADER_PEEL_NEWPH]] ]
+; CHECK-NEXT:    [[COND:%.*]] = tail call i8 @llvm.umin.i8(i8 [[I_6]], i8 -2)
+; CHECK-NEXT:    tail call void @bar(i8 [[COND]])
+; CHECK-NEXT:    [[INC]] = add i8 [[I_6]], 127
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i8 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
@@ -227,3 +343,11 @@ for.body:
 for.cond.cleanup:
   ret void
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 2}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+;.

>From 963110a628cc933e9b7891a23329b2333af064cd Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Thu, 23 May 2024 14:52:15 +0300
Subject: [PATCH 3/5] Review corrections

---
 llvm/lib/Transforms/Utils/LoopPeel.cpp | 37 +++++++++++---------------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index a6d37b9bdf11d..d2686f0da1139 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -351,6 +351,16 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     MaxPeelCount =
         std::min((unsigned)SC->getAPInt().getLimitedValue() - 1, MaxPeelCount);
 
+  auto PeelWhilePredicateIsKnown =
+      [&](unsigned &PeelCount, const SCEV *&IterVal, const SCEV *BoundSCEV,
+          const SCEV *Step, ICmpInst::Predicate Pred) {
+        while (PeelCount < MaxPeelCount &&
+               SE.isKnownPredicate(Pred, IterVal, BoundSCEV)) {
+          IterVal = SE.getAddExpr(IterVal, Step);
+          ++PeelCount;
+        }
+      };
+
   const unsigned MaxDepth = 4;
   std::function<void(Value *, unsigned)> ComputePeelCount =
       [&](Value *Condition, unsigned Depth) -> void {
@@ -411,21 +421,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
       Pred = ICmpInst::getInversePredicate(Pred);
 
     const SCEV *Step = LeftAR->getStepRecurrence(SE);
-    const SCEV *NextIterVal = SE.getAddExpr(IterVal, Step);
-    auto PeelOneMoreIteration = [&IterVal, &NextIterVal, &SE, Step,
-                                 &NewPeelCount]() {
-      IterVal = NextIterVal;
-      NextIterVal = SE.getAddExpr(IterVal, Step);
-      NewPeelCount++;
-    };
-
-    auto CanPeelOneMoreIteration = [&NewPeelCount, &MaxPeelCount]() {
-      return NewPeelCount < MaxPeelCount;
-    };
-
-    while (CanPeelOneMoreIteration() &&
-           SE.isKnownPredicate(Pred, IterVal, RightSCEV))
-      PeelOneMoreIteration();
+    PeelWhilePredicateIsKnown(NewPeelCount, IterVal, RightSCEV, Step, Pred);
 
     // With *that* peel count, does the predicate !Pred become known in the
     // first iteration of the loop body after peeling?
@@ -436,14 +432,15 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     // However, for equality comparisons, that isn't always sufficient to
     // eliminate the comparsion in loop body, we may need to peel one more
     // iteration. See if that makes !Pred become unknown again.
+    const SCEV *NextIterVal = SE.getAddExpr(IterVal, Step);
     if (ICmpInst::isEquality(Pred) &&
         !SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), NextIterVal,
                              RightSCEV) &&
         !SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&
         SE.isKnownPredicate(Pred, NextIterVal, RightSCEV)) {
-      if (!CanPeelOneMoreIteration())
+      if (NewPeelCount >= MaxPeelCount)
         return; // Need to peel one more iteration, but can't. Give up.
-      PeelOneMoreIteration(); // Great!
+      ++NewPeelCount; // Great!
     }
 
     DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);
@@ -489,11 +486,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
       return;
     const SCEV *IterVal = AddRec->evaluateAtIteration(
         SE.getConstant(AddRec->getType(), DesiredPeelCount), SE);
-    while (DesiredPeelCount < MaxPeelCount &&
-           SE.isKnownPredicate(Pred, IterVal, BoundSCEV)) {
-      IterVal = SE.getAddExpr(IterVal, Step);
-      ++DesiredPeelCount;
-    }
+    PeelWhilePredicateIsKnown(DesiredPeelCount, IterVal, BoundSCEV, Step, Pred);
   };
 
   for (BasicBlock *BB : L.blocks()) {

>From f7c88cdac53259f5e21baea7cba9df9197785cd7 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Tue, 28 May 2024 18:37:04 +0300
Subject: [PATCH 4/5] Add review corrections

---
 llvm/lib/Transforms/Utils/LoopPeel.cpp | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index d2686f0da1139..6c7a0b1e1c2a0 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -446,21 +446,8 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);
   };
 
-  auto ComputePeelCountMinMax = [&](IntrinsicInst *II) {
-    bool IsSigned;
-    switch (II->getIntrinsicID()) {
-    case Intrinsic::smax:
-    case Intrinsic::smin:
-      IsSigned = true;
-      break;
-    case Intrinsic::umax:
-    case Intrinsic::umin:
-      IsSigned = false;
-      break;
-    default:
-      return;
-    }
-    Value *LHS = II->getOperand(0), *RHS = II->getOperand(1);
+  auto ComputePeelCountMinMax = [&](MinMaxIntrinsic *MinMax) {
+    Value *LHS = MinMax->getLHS(), *RHS = MinMax->getRHS();
     const SCEV *BoundSCEV, *IterSCEV;
     if (L.isLoopInvariant(LHS)) {
       BoundSCEV = SE.getSCEV(LHS);
@@ -479,9 +466,9 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     // predicates here.
     ICmpInst::Predicate Pred;
     if (SE.isKnownPositive(Step))
-      Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+      Pred = MinMax->isSigned() ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
     else if (SE.isKnownNegative(Step))
-      Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+      Pred = MinMax->isSigned() ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
     else
       return;
     const SCEV *IterVal = AddRec->evaluateAtIteration(
@@ -493,8 +480,8 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     for (Instruction &I : *BB) {
       if (SelectInst *SI = dyn_cast<SelectInst>(&I))
         ComputePeelCount(SI->getCondition(), 0);
-      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
-        ComputePeelCountMinMax(II);
+      if (MinMaxIntrinsic *MinMax = dyn_cast<MinMaxIntrinsic>(&I))
+        ComputePeelCountMinMax(MinMax);
     }
 
     auto *BI = dyn_cast<BranchInst>(BB->getTerminator());

>From 72c0869d55a3b8e4cfc6e84656b64a8e77d833ba Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Wed, 29 May 2024 12:31:45 +0300
Subject: [PATCH 5/5] Do not peel wrapping AddRec

---
 llvm/lib/Transforms/Utils/LoopPeel.cpp        |  8 +++--
 .../peel-loop-min-max-intrinsics.ll           | 29 ++-----------------
 2 files changed, 8 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 6c7a0b1e1c2a0..d25623bbd1942 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -462,15 +462,19 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     if (!AddRec || !AddRec->isAffine() || AddRec->getLoop() != &L)
       return;
     const SCEV *Step = AddRec->getStepRecurrence(SE);
+    bool IsSigned = MinMax->isSigned();
     // To minimize number of peeled iterations, we use strict relational
     // predicates here.
     ICmpInst::Predicate Pred;
     if (SE.isKnownPositive(Step))
-      Pred = MinMax->isSigned() ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+      Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
     else if (SE.isKnownNegative(Step))
-      Pred = MinMax->isSigned() ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+      Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
     else
       return;
+    // Check that AddRec is not wrapping.
+    if (!(IsSigned ? AddRec->hasNoSignedWrap() : AddRec->hasNoUnsignedWrap()))
+      return;
     const SCEV *IterVal = AddRec->evaluateAtIteration(
         SE.getConstant(AddRec->getType(), DesiredPeelCount), SE);
     PeelWhilePredicateIsKnown(DesiredPeelCount, IterVal, BoundSCEV, Step, Pred);
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll
index b990067e398dc..65de0b714a9fb 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll
@@ -291,38 +291,14 @@ define void @test_wrap(i8 %N) {
 ; CHECK-NEXT:    [[CMP5_NOT:%.*]] = icmp eq i8 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP5_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
-; CHECK:       for.body.peel.begin:
-; CHECK-NEXT:    br label [[FOR_BODY_PEEL:%.*]]
-; CHECK:       for.body.peel:
-; CHECK-NEXT:    [[COND_PEEL:%.*]] = tail call i8 @llvm.umin.i8(i8 0, i8 -2)
-; CHECK-NEXT:    tail call void @bar(i8 [[COND_PEEL]])
-; CHECK-NEXT:    [[I_06:%.*]] = add i8 0, 127
-; CHECK-NEXT:    [[EXITCOND_NOT_PEEL:%.*]] = icmp eq i8 [[I_06]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT_PEEL]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_PEEL_NEXT:%.*]]
-; CHECK:       for.body.peel.next:
-; CHECK-NEXT:    br label [[FOR_BODY_PEEL2:%.*]]
-; CHECK:       for.body.peel2:
-; CHECK-NEXT:    [[COND1:%.*]] = tail call i8 @llvm.umin.i8(i8 [[I_06]], i8 -2)
-; CHECK-NEXT:    tail call void @bar(i8 [[COND1]])
-; CHECK-NEXT:    [[INC1:%.*]] = add i8 [[I_06]], 127
-; CHECK-NEXT:    [[EXITCOND_NOT1:%.*]] = icmp eq i8 [[INC1]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT1]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY_PEEL_NEXT1:%.*]]
-; CHECK:       for.body.peel.next1:
-; CHECK-NEXT:    br label [[FOR_BODY_PEEL_NEXT6:%.*]]
-; CHECK:       for.body.peel.next6:
-; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER_PEEL_NEWPH:%.*]]
-; CHECK:       for.body.preheader.peel.newph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[I_6:%.*]] = phi i8 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[INC1]], [[FOR_BODY_PREHEADER_PEEL_NEWPH]] ]
+; CHECK-NEXT:    [[I_6:%.*]] = phi i8 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[COND:%.*]] = tail call i8 @llvm.umin.i8(i8 [[I_6]], i8 -2)
 ; CHECK-NEXT:    tail call void @bar(i8 [[COND]])
 ; CHECK-NEXT:    [[INC]] = add i8 [[I_6]], 127
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i8 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.cond.cleanup.loopexit.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
@@ -349,5 +325,4 @@ for.cond.cleanup:
 ; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
 ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
 ;.



More information about the llvm-commits mailing list