[llvm] [CGP] [CodeGenPrepare] Folding `urem` with loop invariant value plus offset (PR #104724)

Wed Aug 21 11:30:33 PDT 2024

https://github.com/goldsteinn updated https://github.com/llvm/llvm-project/pull/104724

>From ae15931b3fd7c0d456f4224accd1dc90df85d1da Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Wed, 21 Aug 2024 11:26:09 -0700
Subject: [PATCH 1/2] [CodeGenPrepare][X86] Add test for `or disjoint` offset;
 NFC

---
 llvm/test/CodeGen/X86/fold-loop-of-urem.ll    | 82 +++++++++++++++----
 .../CodeGenPrepare/X86/fold-loop-of-urem.ll   | 48 +++++++++--
 2 files changed, 110 insertions(+), 20 deletions(-)

diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
index c4b130a8b4e717..266a9d8411258c 100644
--- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -1245,8 +1245,8 @@ for.body:
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
-define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 %N, i32 %rem_amt_in) nounwind {
-; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw:
+define void @simple_urem_to_sel_non_zero_start_through_dis_or(i32 %N, i32 %rem_amt_in) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_dis_or:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    cmpl $3, %edi
 ; CHECK-NEXT:    jb .LBB22_4
@@ -1258,7 +1258,7 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32
 ; CHECK-NEXT:    movl %edi, %r14d
 ; CHECK-NEXT:    orl $16, %ebx
 ; CHECK-NEXT:    negl %r14d
-; CHECK-NEXT:    movl $7, %r15d
+; CHECK-NEXT:    movl $10, %r15d
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB22_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1270,7 +1270,7 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32
 ; CHECK-NEXT:    leal 1(%r14,%r15), %eax
 ; CHECK-NEXT:    movl %r15d, %ecx
 ; CHECK-NEXT:    incl %ecx
-; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    cmpl $8, %eax
 ; CHECK-NEXT:    movl %ecx, %r15d
 ; CHECK-NEXT:    jne .LBB22_2
 ; CHECK-NEXT:  # %bb.3:
@@ -1289,7 +1289,7 @@ for.cond.cleanup:
 
 for.body:
   %i.04 = phi i32 [ %inc, %for.body ], [ 2, %entry ]
-  %i_with_off = add i32 %i.04, 5
+  %i_with_off = or disjoint i32 %i.04, 8
   %rem = urem i32 %i_with_off, %rem_amt
   tail call void @use.i32(i32 %rem)
   %inc = add nuw i32 %i.04, 1
@@ -1297,8 +1297,8 @@ for.body:
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
-define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem(i32 %N, i32 %rem_amt) nounwind {
-; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem:
+define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 %N, i32 %rem_amt_in) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    cmpl $3, %edi
 ; CHECK-NEXT:    jb .LBB23_4
@@ -1308,6 +1308,7 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem(
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    orl $16, %ebx
 ; CHECK-NEXT:    negl %r14d
 ; CHECK-NEXT:    movl $7, %r15d
 ; CHECK-NEXT:    .p2align 4, 0x90
@@ -1330,6 +1331,57 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem(
 ; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:  .LBB23_4: # %for.cond.cleanup
 ; CHECK-NEXT:    retq
+entry:
+  %rem_amt = or i32 %rem_amt_in, 16
+  %cmp3.not = icmp ult i32 %N, 3
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 2, %entry ]
+  %i_with_off = add i32 %i.04, 5
+  %rem = urem i32 %i_with_off, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $3, %edi
+; CHECK-NEXT:    jb .LBB24_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    negl %r14d
+; CHECK-NEXT:    movl $7, %r15d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB24_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r15d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32 at PLT
+; CHECK-NEXT:    leal 1(%r14,%r15), %eax
+; CHECK-NEXT:    movl %r15d, %ecx
+; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    movl %ecx, %r15d
+; CHECK-NEXT:    jne .LBB24_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:  .LBB24_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
 entry:
   %cmp3.not = icmp ult i32 %N, 3
   br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
@@ -1357,14 +1409,14 @@ define void @simple_urem_to_sel_non_zero_start_through_sub(i32 %N, i32 %rem_amt,
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %edi, %ebp
 ; CHECK-NEXT:    subl %edx, %ebp
-; CHECK-NEXT:    jbe .LBB24_3
+; CHECK-NEXT:    jbe .LBB25_3
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    xorl %r15d, %r15d
 ; CHECK-NEXT:    xorl %r14d, %r14d
 ; CHECK-NEXT:    xorl %r12d, %r12d
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB24_2: # %for.body
+; CHECK-NEXT:  .LBB25_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movl %r14d, %edi
 ; CHECK-NEXT:    callq use.i32 at PLT
@@ -1373,8 +1425,8 @@ define void @simple_urem_to_sel_non_zero_start_through_sub(i32 %N, i32 %rem_amt,
 ; CHECK-NEXT:    cmovel %r15d, %r14d
 ; CHECK-NEXT:    incl %r12d
 ; CHECK-NEXT:    cmpl %r12d, %ebp
-; CHECK-NEXT:    jne .LBB24_2
-; CHECK-NEXT:  .LBB24_3: # %for.cond.cleanup
+; CHECK-NEXT:    jne .LBB25_2
+; CHECK-NEXT:  .LBB25_3: # %for.cond.cleanup
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r12
 ; CHECK-NEXT:    popq %r14
@@ -1402,7 +1454,7 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3
 ; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_sub_no_simplfy:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    cmpl %edx, %edi
-; CHECK-NEXT:    jbe .LBB25_4
+; CHECK-NEXT:    jbe .LBB26_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
@@ -1413,7 +1465,7 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3
 ; CHECK-NEXT:    negl %r14d
 ; CHECK-NEXT:    addl $-2, %r15d
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB25_2: # %for.body
+; CHECK-NEXT:  .LBB26_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movl %r15d, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
@@ -1425,12 +1477,12 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    cmpl $-2, %eax
 ; CHECK-NEXT:    movl %ecx, %r15d
-; CHECK-NEXT:    jne .LBB25_2
+; CHECK-NEXT:    jne .LBB26_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    popq %r15
-; CHECK-NEXT:  .LBB25_4: # %for.cond.cleanup
+; CHECK-NEXT:  .LBB26_4: # %for.cond.cleanup
 ; CHECK-NEXT:    retq
 entry:
   %cmp3.not = icmp ule i32 %N, %start
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
index abfbf2e5e582eb..380fcc42a4cad9 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
@@ -319,20 +319,20 @@ for.body.tail:
 define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
 ; CHECK-LABEL: define void @simple_urem_to_sel_vec(
 ; CHECK-SAME: <2 x i64> [[REM_AMT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
-; CHECK:       [[ENTRY:.*]]:
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
-; CHECK-NEXT:    [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[FOR_COND_CLEANUP]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT:    [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[FOR_COND_CLEANUP]] ]
+; CHECK-NEXT:    [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[ENTRY]] ]
 ; CHECK-NEXT:    tail call void @use.2xi64(<2 x i64> [[REM]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw <2 x i64> [[REM]], <i64 1, i64 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], [[REM_AMT]]
 ; CHECK-NEXT:    [[TMP3]] = select <2 x i1> [[TMP2]], <2 x i64> zeroinitializer, <2 x i64> [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw <2 x i64> [[I_04]], <i64 1, i64 1>
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = call i1 @get.i1()
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[ENTRY]], label %[[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ;
 entry:
   br label %for.body
@@ -918,6 +918,44 @@ for.body:
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+define void @simple_urem_to_sel_non_zero_start_through_dis_or(i32 %N, i32 %rem_amt_in) nounwind {
+; CHECK-LABEL: define void @simple_urem_to_sel_non_zero_start_through_dis_or(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT_IN:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[REM_AMT:%.*]] = or i32 [[REM_AMT_IN]], 16
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp ult i32 [[N]], 3
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = or disjoint i32 [[I_04]], 8
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %rem_amt = or i32 %rem_amt_in, 16
+  %cmp3.not = icmp ult i32 %N, 3
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 2, %entry ]
+  %i_with_off = or disjoint i32 %i.04, 8
+  %rem = urem i32 %i_with_off, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
 define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 %N, i32 %rem_amt_in) nounwind {
 ; CHECK-LABEL: define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(
 ; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT_IN:%.*]]) #[[ATTR0]] {

>From 191df4d9e36b5642d82de89cbdc6d64af555596a Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Sun, 18 Aug 2024 16:12:15 -0700
Subject: [PATCH 2/2] [CGP][CodeGenPrepare] Folding `urem` with loop invariant
 value plus offset

This extends the existing fold:

```
for(i = Start; i < End; ++i)
   Rem = (i nuw+- IncrLoopInvariant) u% RemAmtLoopInvariant;
```
 ->
```
Rem = (Start nuw+- IncrLoopInvariant) % RemAmtLoopInvariant;
for(i = Start; i < End; ++i, ++rem)
   Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
```

To work with a non-zero `IncrLoopInvariant`.

This is a common usage in cases such as:

```
for(i = 0; i < N; ++i)
    if ((i + 1) % X) == 0)
        do_something_occasionally_but_not_first_iter();
```

Alive2 w/ i4/unrolled 6x (needs to be ran locally due to timeout):
https://alive2.llvm.org/ce/z/6tgyN3

Exhaust proof over all uint8_t combinations in C++:
https://godbolt.org/z/WYa561388
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 83 +++++++++++++++++--
 .../CodeGenPrepare/X86/fold-loop-of-urem.ll   | 18 ++--
 2 files changed, 87 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index bf48c1fdab0ff0..d37682fa1c88fd 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1976,17 +1976,43 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
   return true;
 }
 
-static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
-                                                  const LoopInfo *LI,
-                                                  Value *&RemAmtOut,
-                                                  PHINode *&LoopIncrPNOut) {
+static bool isRemOfLoopIncrementWithLoopInvariant(
+    Instruction *Rem, const LoopInfo *LI, Value *&RemAmtOut, bool &AddOrSubOut,
+    Value *&AddOrSubInstOut, Value *&AddOrSubOffsetOut,
+    PHINode *&LoopIncrPNOut) {
   Value *Incr, *RemAmt;
   // NB: If RemAmt is a power of 2 it *should* have been transformed by now.
   if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
     return false;
 
+  bool AddOrSub = false;
+  Value *AddOrSubOffset;
   // Find out loop increment PHI.
   auto *PN = dyn_cast<PHINode>(Incr);
+  if (PN != nullptr) {
+    AddOrSub = false;
+    AddOrSubOffset = nullptr;
+  } else {
+    // Search through a NUW add/sub on top of the loop increment.
+    Value *V0, *V1;
+    bool Add = match(Incr, m_NUWAddLike(m_Value(V0), m_Value(V1)));
+    bool Sub = match(Incr, m_NUWSub(m_Value(V0), m_Value(V1)));
+    if (!Add && !Sub)
+      return false;
+
+    AddOrSub = true;
+
+    AddOrSubInstOut = Incr;
+
+    PN = dyn_cast<PHINode>(V0);
+    if (PN != nullptr) {
+      AddOrSubOffset = V1;
+    } else if (Add) {
+      PN = dyn_cast<PHINode>(V1);
+      AddOrSubOffset = V0;
+    }
+  }
+
   if (!PN)
     return false;
 
@@ -2026,6 +2052,8 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
   // Set output variables.
   RemAmtOut = RemAmt;
   LoopIncrPNOut = PN;
+  AddOrSubOut = AddOrSub;
+  AddOrSubOffsetOut = AddOrSubOffset;
 
   return true;
 }
@@ -2040,15 +2068,15 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
 // Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
 // for(i = Start; i < End; ++i, ++rem)
 //    Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
-//
-// Currently only implemented for `IncrLoopInvariant` being zero.
 static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
                                     const LoopInfo *LI,
                                     SmallSet<BasicBlock *, 32> &FreshBBs,
                                     bool IsHuge) {
-  Value *RemAmt;
+  bool AddOrSub;
+  Value *AddOrSubOffset, *RemAmt, *AddOrSubInst;
   PHINode *LoopIncrPN;
-  if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, LoopIncrPN))
+  if (!isRemOfLoopIncrementWithLoopInvariant(
+          Rem, LI, RemAmt, AddOrSub, AddOrSubInst, AddOrSubOffset, LoopIncrPN))
     return false;
 
   // Only non-constant remainder as the extra IV is probably not profitable
@@ -2066,6 +2094,43 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
 
   Loop *L = LI->getLoopFor(LoopIncrPN->getParent());
   Value *Start = LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader());
+  // If we have add/sub create initial value for remainder.
+  // The logic here is:
+  // (urem (add/sub nuw Start, IncrLoopInvariant), RemAmtLoopInvariant
+  //
+  // Only proceed if the expression simplifies (otherwise we can't fully
+  // optimize out the urem).
+  if (AddOrSub) {
+    assert(AddOrSubOffset && AddOrSubInst &&
+           "We found an add/sub but missing values");
+    // Without dom-condition/assumption cache we aren't likely to get much out
+    // of a context instruction.
+    const SimplifyQuery Q(*DL);
+    Instruction::BinaryOps Opc =
+        cast<BinaryOperator>(AddOrSubInst)->getOpcode();
+    switch (Opc) {
+    case Instruction::Add:
+      Start =
+          simplifyAddInst(Start, AddOrSubOffset,
+                          match(AddOrSubInst, m_NSWAdd(m_Value(), m_Value())),
+                          /*IsNUW=*/true, Q);
+      break;
+    case Instruction::Sub:
+      Start =
+          simplifySubInst(Start, AddOrSubOffset,
+                          match(AddOrSubInst, m_NSWSub(m_Value(), m_Value())),
+                          /*IsNUW=*/true, Q);
+      break;
+    case Instruction::Or:
+      Start = simplifyOrInst(Start, AddOrSubOffset, Q);
+      break;
+    default:
+      llvm_unreachable("Unknown offset instruction");
+    }
+    if (!Start)
+      return false;
+  }
+
   // If we can't fully optimize out the `rem`, skip this transform.
   Start = simplifyURemInst(Start, RemAmt, *DL);
   if (!Start)
@@ -2096,6 +2161,8 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
 
   replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
   Rem->eraseFromParent();
+  if (AddOrSubInst && AddOrSubInst->use_empty())
+    cast<Instruction>(AddOrSubInst)->eraseFromParent();
   return true;
 }
 
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
index 380fcc42a4cad9..acce88ce527584 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
@@ -892,10 +892,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 7, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = add nuw i32 [[I_04]], 5
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -930,10 +932,12 @@ define void @simple_urem_to_sel_non_zero_start_through_dis_or(i32 %N, i32 %rem_a
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 10, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = or disjoint i32 [[I_04]], 8
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -1041,10 +1045,12 @@ define void @simple_urem_to_sel_non_zero_start_through_sub(i32 %N, i32 %rem_amt,
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = sub nuw i32 [[I_04]], [[START]]
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]