[llvm] goldsteinn/cgp urem of liv (PR #96625)

Tue Jun 25 05:09:34 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-x86

Author: None (goldsteinn)

<details>
<summary>Changes</summary>

- **[CodeGenPrepare][X86] Add tests for folding `urem` with loop invariant value; NFC**
- **[CodeGenPrepare] Folding `urem` with loop invariant value**


---

Patch is 34.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96625.diff


2 Files Affected:

- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+157) 
- (added) llvm/test/CodeGen/X86/fold-loop-of-urem.ll (+864) 


``````````diff

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 900c33b580f15..d2cc3b4cb326c 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -471,6 +471,7 @@ class CodeGenPrepare {
   bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, Value *Arg1,
                                    CmpInst *Cmp, Intrinsic::ID IID);
   bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
+  bool optimizeRem(Instruction *Rem);
   bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   void verifyBFIUpdates(Function &F);
@@ -1974,6 +1975,157 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
   return true;
 }
 
+static bool isRemOfLoopIncrementWithLIV(Value *Rem, const LoopInfo *LI,
+                                        Value *&RemAmtOut,
+                                        std::optional<bool> &AddOrSubOut,
+                                        Value *&AddOrSubOffsetOut,
+                                        PHINode *&LoopIncrPNOut) {
+  Value *Incr, *RemAmt;
+  if (!isa<Instruction>(Rem))
+    return false;
+  // NB: If RemAmt is a power of 2 it *should* have been transformed by now.
+  if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
+    return false;
+
+  // Only trivially analyzable loops.
+  Loop *L = LI->getLoopFor(cast<Instruction>(Rem)->getParent());
+  if (L == nullptr || L->getLoopPreheader() == nullptr ||
+      L->getLoopLatch() == nullptr)
+    return false;
+
+  std::optional<bool> AddOrSub;
+  Value *AddOrSubOffset;
+  // Find out loop increment PHI.
+  PHINode *PN = dyn_cast<PHINode>(Incr);
+  if (PN != nullptr) {
+    AddOrSub = std::nullopt;
+    AddOrSubOffset = nullptr;
+  } else {
+    // Search through a NUW add/sub.
+    Value *V0, *V1;
+    if (match(Incr, m_NUWAddLike(m_Value(V0), m_Value(V1))))
+      AddOrSub = true;
+    else if (match(Incr, m_NUWSub(m_Value(V0), m_Value(V1))))
+      AddOrSub = false;
+    else
+      return false;
+
+    PN = dyn_cast<PHINode>(V0);
+    if (PN != nullptr) {
+      AddOrSubOffset = V1;
+    } else if (*AddOrSub) {
+      PN = dyn_cast<PHINode>(V1);
+      AddOrSubOffset = V0;
+    }
+  }
+
+  if (PN == nullptr)
+    return false;
+
+  // This isn't strictly necessary, what we really need is one increment and any
+  // amount of initial values all being the same.
+  if (PN->getNumIncomingValues() != 2)
+    return false;
+
+  // Only works if the remainder amount is a loop invaraint
+  if (!L->isLoopInvariant(RemAmt))
+    return false;
+
+  // Is the PHI a loop increment?
+  auto LoopIncrInfo = getIVIncrement(PN, LI);
+  if (!LoopIncrInfo.has_value())
+    return false;
+
+  // We need remainder_amount % increment_amount to be zero. Increment of one
+  // satisfies that without any special logic and is overwhelmingly the common
+  // case.
+  if (!match(LoopIncrInfo->second, m_One()))
+    return false;
+
+  // Need the increment to not overflow.
+  if (!match(LoopIncrInfo->first, m_NUWAdd(m_Value(), m_Value())))
+    return false;
+
+  // Set output variables.
+  RemAmtOut = RemAmt;
+  LoopIncrPNOut = PN;
+  AddOrSubOut = AddOrSub;
+  AddOrSubOffsetOut = AddOrSubOffset;
+
+  return true;
+}
+
+// Try to transform:
+//
+// for(i = Start; i < End; ++i)
+//    Rem = (i nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;
+//
+// ->
+//
+// Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
+// for(i = Start; i < End; ++i, ++rem)
+//    Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
+//
+// Currently only implemented for `Start` and `IncrLoopInvariant` being zero.
+static bool foldURemOfLoopIncrement(Instruction *Rem, const LoopInfo *LI,
+                                   SmallSet<BasicBlock *, 32> &FreshBBs,
+                                   bool IsHuge) {
+  std::optional<bool> AddOrSub;
+  Value *AddOrSubOffset, *RemAmt;
+  PHINode *LoopIncrPN;
+  if (!isRemOfLoopIncrementWithLIV(Rem, LI, RemAmt, AddOrSub, AddOrSubOffset,
+                                   LoopIncrPN))
+    return false;
+
+  // Only non-constant remainder as the extra IV is is probably not profitable
+  // in that case. Further, since remainder amount is non-constant, only handle
+  // case where `IncrLoopInvariant` and `Start` are 0 to entirely eliminate the
+  // rem (as opposed to just hoisting it outside of the loop).
+  //
+  // Potential TODO: Should we have a check for how "nested" this remainder
+  // operation is? The new code runs every iteration so if the remainder is
+  // guarded behind unlikely conditions this might not be worth it.
+  if (AddOrSub.has_value() || match(RemAmt, m_ImmConstant()))
+    return false;
+  Loop *L = LI->getLoopFor(Rem->getParent());
+  if (!match(LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader()),
+             m_Zero()))
+    return false;
+
+  // Create new remainder with induction variable.
+  Type *Ty = Rem->getType();
+  IRBuilder<> Builder(Rem->getContext());
+
+  Builder.SetInsertPoint(LoopIncrPN);
+  PHINode *NewRem = Builder.CreatePHI(Ty, 2);
+
+  Builder.SetInsertPoint(cast<Instruction>(
+      LoopIncrPN->getIncomingValueForBlock(L->getLoopLatch())));
+  // `(add (urem x, y), 1)` is always nuw.
+  Value *RemAdd = Builder.CreateNUWAdd(NewRem, ConstantInt::get(Ty, 1));
+  Value *RemCmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, RemAdd, RemAmt);
+  Value *RemSel =
+      Builder.CreateSelect(RemCmp, Constant::getNullValue(Ty), RemAdd);
+
+  NewRem->addIncoming(Constant::getNullValue(Ty), L->getLoopPreheader());
+  NewRem->addIncoming(RemSel, L->getLoopLatch());
+
+  // Insert all touched BBs.
+  FreshBBs.insert(LoopIncrPN->getParent());
+  FreshBBs.insert(L->getLoopLatch());
+  FreshBBs.insert(Rem->getParent());
+
+  replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
+  Rem->eraseFromParent();
+  return true;
+}
+
+bool CodeGenPrepare::optimizeRem(Instruction *Rem) {
+  if (foldURemOfLoopIncrement(Rem, LI, FreshBBs, IsHugeFunc))
+    return true;
+  return false;
+}
+
 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
   if (sinkCmpExpression(Cmp, *TLI))
     return true;
@@ -8360,6 +8512,11 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (optimizeCmp(Cmp, ModifiedDT))
       return true;
 
+  if (match(I, m_URem(m_Value(), m_Value())) ||
+      match(I, m_SRem(m_Value(), m_Value())))
+    if (optimizeRem(I))
+      return true;
+
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     bool Modified = optimizeLoadExt(LI);
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
new file mode 100644
index 0000000000000..9b093ec259201
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -0,0 +1,864 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare void @use.i32(i32)
+declare void @use.2xi64(<2 x i64>)
+declare void @do_stuff0()
+declare void @do_stuff1()
+declare i1 @get.i1()
+declare i32 @get.i32()
+
+define void @simple_urem_to_sel(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB0_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r15d, %r15d
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r12d, %r12d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq use.i32 at PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r15d, %r14d
+; CHECK-NEXT:    incl %r12d
+; CHECK-NEXT:    cmpl %r12d, %ebp
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB0_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_nested2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB1_8
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r15d, %r15d
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r12d, %r12d
+; CHECK-NEXT:    jmp .LBB1_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB1_5: # %for.body1
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq use.i32 at PLT
+; CHECK-NEXT:  .LBB1_6: # %for.body.tail
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r15d, %r14d
+; CHECK-NEXT:    incl %r12d
+; CHECK-NEXT:    cmpl %r12d, %ebp
+; CHECK-NEXT:    je .LBB1_7
+; CHECK-NEXT:  .LBB1_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    callq get.i1 at PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB1_6
+; CHECK-NEXT:  # %bb.3: # %for.body0
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    callq get.i1 at PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB1_5
+; CHECK-NEXT:  # %bb.4: # %for.body2
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    callq get.i1 at PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB1_5
+; CHECK-NEXT:    jmp .LBB1_6
+; CHECK-NEXT:  .LBB1_7:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB1_8: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ]
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+  %cond1 = call i1 @get.i1()
+  br i1 %cond1, label %for.body1, label %for.body2
+for.body2:
+  %cond2 = call i1 @get.i1()
+  br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1:
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  br label %for.body.tail
+for.body.tail:
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr3(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB2_9
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    jmp .LBB2_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB2_6: # %for.body1
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    movl %ebp, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32 at PLT
+; CHECK-NEXT:  .LBB2_7: # %for.body.tail
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    callq get.i1 at PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB2_8
+; CHECK-NEXT:  .LBB2_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    callq get.i1 at PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB2_5
+; CHECK-NEXT:  # %bb.3: # %for.body0
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    callq get.i1 at PLT
+; CHECK-NEXT:    movl %eax, %r14d
+; CHECK-NEXT:    callq get.i32 at PLT
+; CHECK-NEXT:    testb $1, %r14b
+; CHECK-NEXT:    je .LBB2_7
+; CHECK-NEXT:  # %bb.4: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    movl %eax, %ebp
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    jmp .LBB2_6
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB2_5: # %for.body2
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    callq get.i1 at PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB2_6
+; CHECK-NEXT:    jmp .LBB2_7
+; CHECK-NEXT:  .LBB2_8:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB2_9: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body2
+for.body0:
+  %cond1 = call i1 @get.i1()
+  %val = call i32 @get.i32()
+  %inc = add nuw i32 %val, 1
+  br i1 %cond1, label %for.body1, label %for.body.tail
+for.body2:
+  %cond2 = call i1 @get.i1()
+  br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1:
+  %i.04 = phi i32 [ %inc, %for.body0], [ 0, %for.body2 ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  br label %for.body.tail
+for.body.tail:
+  %exitcond.not = call i1 @get.i1()
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_vec:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $56, %rsp
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB3_1: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    callq use.2xi64 at PLT
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    movdqa (%rsp), %xmm2 # 16-byte Reload
+; CHECK-NEXT:    psubq %xmm1, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm2, %xmm3
+; CHECK-NEXT:    pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; CHECK-NEXT:    pand %xmm0, %xmm2
+; CHECK-NEXT:    pandn %xmm3, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    psubq %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq get.i1 at PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    je .LBB3_1
+; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT:    addq $56, %rsp
+; CHECK-NEXT:    retq
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi <2 x i64> [ %inc, %for.body ], [ zeroinitializer, %entry ]
+  %rem = urem <2 x i64> %i.04, %rem_amt
+  tail call void @use.2xi64(<2 x i64> %rem)
+  %inc = add nuw <2 x i64> %i.04, <i64 1, i64 1>
+  %exitcond.not = call i1 @get.i1()
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB4_6
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    jmp .LBB4_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB4_4: # %for.body.tail
+; CHECK-NEXT:    # in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32 at PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %ebp, %r14d
+; CHECK-NEXT:    je .LBB4_5
+; CHECK-NEXT:  .LBB4_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    callq get.i1 at PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB4_4
+; CHECK-NEXT:  # %bb.3: # %for.body0
+; CHECK-NEXT:    # in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    callq get.i32 at PLT
+; CHECK-NEXT:    movl %eax, %r14d
+; CHECK-NEXT:    jmp .LBB4_4
+; CHECK-NEXT:  .LBB4_5:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB4_6: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.03 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ]
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+  %some_val = call i32 @get.i32()
+  br label %for.body.tail
+
+for.body.tail:
+  %i.04 = phi i32 [ %i.03, %for.body ], [ %some_val, %for.body0 ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_second_acc(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_second_acc:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    jb .LBB5_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movl $1, %r15d
+; CHECK-NEXT:    xorl %r12d, %r12d
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r13d, %r13d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB5_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq use.i32 at PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r12d, %r14d
+; CHECK-NEXT:    incl %r13d
+; CHECK-NEXT:    addl $2, %r15d
+; CHECK-NEXT:    cmpl %ebp, %r15d
+; CHECK-NEXT:    jbe .LBB5_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB5_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp ult i32 %N, 2
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %i.05 = phi i32 [ %inc2, %for.body ], [ 1, %entry ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %inc2 = add nuw i32 %i.05, 2
+  %exitcond.not = icmp ugt i32 %inc2, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_srem(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_srem:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB6_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/96625