[llvm] [CodeGenPrepare] Folding `urem` with loop invariant value as remainder (PR #96625)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 25 11:17:39 PDT 2024


https://github.com/goldsteinn updated https://github.com/llvm/llvm-project/pull/96625

>From 53916468e6ce9c9cac158c49771d322526025c97 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Tue, 25 Jun 2024 19:50:51 +0800
Subject: [PATCH 1/4] [CodeGenPrepare][X86] Add tests for folding `urem` with
 loop invariant value; NFC

---
 llvm/test/CodeGen/X86/fold-loop-of-urem.ll    | 1159 +++++++++++++++++
 .../CodeGenPrepare/X86/fold-loop-of-urem.ll   |  858 ++++++++++++
 2 files changed, 2017 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/fold-loop-of-urem.ll
 create mode 100644 llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll

diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
new file mode 100644
index 0000000000000..aad2e0dd7bd24
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -0,0 +1,1159 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare void @use.i32(i32) ; external consumer of an i32; no body in this file
+declare void @use.2xi64(<2 x i64>) ; external consumer of a <2 x i64>
+declare void @do_stuff0() ; opaque external call, no arguments or result
+declare void @do_stuff1() ; opaque external call, no arguments or result
+declare i1 @get.i1() ; external producer of an i1
+declare i32 @get.i32() ; external producer of an i32
+
+define void @simple_urem_to_sel(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB0_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB0_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; induction variable, 0 on loop entry
+  %rem = urem i32 %i.04, %rem_amt ; IV urem the function argument %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1 ; unit step, carries nuw
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_nested2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB1_8
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    jmp .LBB1_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB1_5: # %for.body1
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:  .LBB1_6: # %for.body.tail
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    je .LBB1_7
+; CHECK-NEXT:  .LBB1_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB1_6
+; CHECK-NEXT:  # %bb.3: # %for.body0
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB1_5
+; CHECK-NEXT:  # %bb.4: # %for.body2
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB1_5
+; CHECK-NEXT:    jmp .LBB1_6
+; CHECK-NEXT:  .LBB1_7:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB1_8: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ] ; IV; its increment lives in for.body.tail
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+  %cond1 = call i1 @get.i1()
+  br i1 %cond1, label %for.body1, label %for.body2
+for.body2:
+  %cond2 = call i1 @get.i1()
+  br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1:
+  %rem = urem i32 %i.04, %rem_amt ; urem sits in a conditionally reached block
+  tail call void @use.i32(i32 %rem)
+  br label %for.body.tail
+for.body.tail:
+  %inc = add nuw i32 %i.04, 1 ; unit step, carries nuw
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr3(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB2_9
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    jmp .LBB2_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB2_6: # %for.body1
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    movl %ebp, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:  .LBB2_7: # %for.body.tail
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB2_8
+; CHECK-NEXT:  .LBB2_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB2_5
+; CHECK-NEXT:  # %bb.3: # %for.body0
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    movl %eax, %r14d
+; CHECK-NEXT:    callq get.i32@PLT
+; CHECK-NEXT:    testb $1, %r14b
+; CHECK-NEXT:    je .LBB2_7
+; CHECK-NEXT:  # %bb.4: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    movl %eax, %ebp
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    jmp .LBB2_6
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB2_5: # %for.body2
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB2_6
+; CHECK-NEXT:    jmp .LBB2_7
+; CHECK-NEXT:  .LBB2_8:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB2_9: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body2
+for.body0:
+  %cond1 = call i1 @get.i1()
+  %val = call i32 @get.i32()
+  %inc = add nuw i32 %val, 1 ; increments a call result, not the phi below
+  br i1 %cond1, label %for.body1, label %for.body.tail
+for.body2:
+  %cond2 = call i1 @get.i1()
+  br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1:
+  %i.04 = phi i32 [ %inc, %for.body0 ], [ 0, %for.body2 ] ; merges two in-loop values; not in the header block
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  br label %for.body.tail
+for.body.tail:
+  %exitcond.not = call i1 @get.i1() ; exit condition comes from a call
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_vec:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movq %xmm0, %rbx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; CHECK-NEXT:    movq %xmm0, %r14
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB3_1: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movq %xmm1, %rax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divq %rbx
+; CHECK-NEXT:    movq %rdx, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; CHECK-NEXT:    movq %xmm1, %rax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divq %r14
+; CHECK-NEXT:    movq %rdx, %xmm1
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    callq use.2xi64@PLT
+; CHECK-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    psubq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB3_1
+; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    retq
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi <2 x i64> [ %inc, %for.body ], [ zeroinitializer, %entry ] ; vector IV, all-zero on entry
+  %rem = urem <2 x i64> %i.04, %rem_amt ; element-wise urem by the vector argument
+  tail call void @use.2xi64(<2 x i64> %rem)
+  %inc = add nuw <2 x i64> %i.04, <i64 1, i64 1> ; adds 1 to each lane, carries nuw
+  %exitcond.not = call i1 @get.i1() ; exit condition comes from a call
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB4_6
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    jmp .LBB4_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB4_4: # %for.body.tail
+; CHECK-NEXT:    # in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %ebp, %r14d
+; CHECK-NEXT:    je .LBB4_5
+; CHECK-NEXT:  .LBB4_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB4_4
+; CHECK-NEXT:  # %bb.3: # %for.body0
+; CHECK-NEXT:    # in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    callq get.i32@PLT
+; CHECK-NEXT:    movl %eax, %r14d
+; CHECK-NEXT:    jmp .LBB4_4
+; CHECK-NEXT:  .LBB4_5:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB4_6: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.body:
+  %i.03 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ]
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+  %some_val = call i32 @get.i32()
+  br label %for.body.tail
+
+for.body.tail:
+  %i.04 = phi i32 [ %i.03, %for.body ], [ %some_val, %for.body0 ] ; header phi or a fresh call result
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1 ; increments the merged value, not %i.03 directly
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @simple_urem_to_sel_second_acc(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_second_acc:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    jb .LBB5_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movl $1, %r15d
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB5_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    addl $2, %r15d
+; CHECK-NEXT:    cmpl %ebp, %r15d
+; CHECK-NEXT:    jbe .LBB5_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB5_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp ult i32 %N, 2
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; recurrence feeding the urem
+  %i.05 = phi i32 [ %inc2, %for.body ], [ 1, %entry ] ; second recurrence: starts at 1
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %inc2 = add nuw i32 %i.05, 2 ; second recurrence steps by 2
+  %exitcond.not = icmp ugt i32 %inc2, %N ; exit test uses %inc2, not %inc
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_srem(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_srem:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB6_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB6_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    cltd
+; CHECK-NEXT:    idivl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    jne .LBB6_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB6_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %rem = srem i32 %i.04, %rem_amt ; signed remainder (srem), not urem
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_missing_nuw(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_missing_nuw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB7_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB7_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    jne .LBB7_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB7_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nsw i32 %i.04, 1 ; increment carries nsw only, no nuw
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr2(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB8_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB8_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    addl $2, %r14d
+; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    jne .LBB8_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB8_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 2 ; step is 2 rather than 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_non_zero_entry4(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_non_zero_entry4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB9_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movl $4, %r14d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB9_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    jne .LBB9_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB9_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 4, %entry ] ; IV starts at 4 instead of 0
+  %rem = urem i32 %i.04, %rem_amt ; divisor is an unconstrained argument
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_skip_const_rem_amt(i32 %N) nounwind {
+; CHECK-LABEL: simple_urem_skip_const_rem_amt:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB10_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    addl $-4, %ebx
+; CHECK-NEXT:    movl $4, %ebp
+; CHECK-NEXT:    movl $2938661835, %r14d # imm = 0xAF286BCB
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB10_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %ebp, %eax
+; CHECK-NEXT:    imulq %r14, %rax
+; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    movl %ebp, %ecx
+; CHECK-NEXT:    subl %eax, %ecx
+; CHECK-NEXT:    shrl %ecx
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    shrl $4, %ecx
+; CHECK-NEXT:    leal (%rcx,%rcx,8), %eax
+; CHECK-NEXT:    leal (%rcx,%rax,2), %eax
+; CHECK-NEXT:    movl %ebp, %edi
+; CHECK-NEXT:    subl %eax, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    decl %ebx
+; CHECK-NEXT:    jne .LBB10_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB10_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 4, %entry ] ; IV starts at 4
+  %rem = urem i32 %i.04, 19 ; constant divisor; the checked asm uses multiply/shift, no div
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_no_preheader_non_canonical(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_no_preheader_non_canonical:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB11_1
+; CHECK-NEXT:  # %bb.2: # %for.body1
+; CHECK-NEXT:    movl $1, %r14d
+; CHECK-NEXT:    jmp .LBB11_3
+; CHECK-NEXT:  .LBB11_1:
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB11_3: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    jne .LBB11_3
+; CHECK-NEXT:  # %bb.4: # %for.cond.cleanup
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.body0, label %for.body1
+
+for.cond.cleanup:
+  ret void
+
+for.body0:
+  br label %for.body
+
+for.body1:
+  br label %for.body
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %for.body0 ], [ 1, %for.body1 ] ; two entry edges with different start values
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_multi_latch_non_canonical(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_multi_latch_non_canonical:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB12_6
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    decl %ebp
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    jmp .LBB12_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB12_3: # %for.body.backedge
+; CHECK-NEXT:    # in Loop: Header=BB12_2 Depth=1
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:  .LBB12_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    movl %eax, %r15d
+; CHECK-NEXT:    callq do_stuff0@PLT
+; CHECK-NEXT:    testb $1, %r15b
+; CHECK-NEXT:    je .LBB12_3
+; CHECK-NEXT:  # %bb.4: # %for.body0
+; CHECK-NEXT:    # in Loop: Header=BB12_2 Depth=1
+; CHECK-NEXT:    callq do_stuff1@PLT
+; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    jne .LBB12_3
+; CHECK-NEXT:  # %bb.5:
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB12_6: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ %inc, %for.body0 ], [ 0, %entry ] ; two backedges, both carrying %inc
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %cond = call i1 @get.i1()
+  call void @do_stuff0()
+  br i1 %cond, label %for.body0, label %for.body ; first backedge: straight back to the header
+for.body0:
+  call void @do_stuff1()
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body ; second backedge
+}
+
+define void @simple_urem_fail_bad_loop(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_loop:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    callq get.i32@PLT
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    # implicit-def: $r14d
+; CHECK-NEXT:    jne .LBB13_4
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:  .LBB13_2: # %for.cond
+; CHECK-NEXT:    cmpl %ebp, %r14d
+; CHECK-NEXT:    jae .LBB13_5
+; CHECK-NEXT:  # %bb.3: # %for.body
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    xorl $1, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:  .LBB13_4: # %halfway
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    jmp .LBB13_2
+; CHECK-NEXT:  .LBB13_5: # %for.end
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+entry:
+  %call = call i32 @get.i32()
+  %tobool.not = icmp eq i32 %call, 0
+  br i1 %tobool.not, label %for.cond, label %halfway ; the cycle can be entered at for.cond or at halfway
+
+for.cond:
+  %i.0 = phi i32 [ %inc, %halfway ], [ 0, %entry ]
+  %cmp = icmp ult i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %xor = xor i32 %i.0, 1
+  call void @use.i32(i32 %xor)
+  br label %halfway
+
+halfway:
+  %i.1 = phi i32 [ poison, %entry ], [ %i.0, %for.body ] ; poison on the edge from entry
+  %rem = urem i32 %i.1, %rem_amt
+  call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.1, 1
+  br label %for.cond
+
+for.end:
+  ret void
+}
+
+define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_intermediate_inc:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB14_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    negl %r14d
+; CHECK-NEXT:    movl $1, %r15d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB14_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r15d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    leal 1(%r14,%r15), %eax
+; CHECK-NEXT:    movl %r15d, %ecx
+; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    cmpl $1, %eax
+; CHECK-NEXT:    movl %ecx, %r15d
+; CHECK-NEXT:    jne .LBB14_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:  .LBB14_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %inc2 = add nuw i32 %i.04, 1 ; separate add with the same operands as %inc below
+  %rem = urem i32 %inc2, %rem_amt ; dividend is %inc2, not the phi itself
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @weird_loop(i64 %sub.ptr.div.i56) personality ptr null {
+; CHECK-LABEL: weird_loop:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB15_1: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    jmp .LBB15_1
+entry:
+  br label %for.preheader
+
+for.preheader:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ; recurrence lives outside the for.body self-loop
+  %iv.next = add nuw i64 %iv, 1
+  br label %for.body
+
+for.body:
+  %rem = urem i64 %iv, %sub.ptr.div.i56 ; result is unused
+  br i1 false, label %for.preheader, label %for.body ; constant-false condition: always branches back to for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_non_zero_start:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $3, %edi
+; CHECK-NEXT:    jb .LBB16_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movl $2, %r14d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB16_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    jne .LBB16_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB16_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp ult i32 %N, 3 ; loop is skipped when N < 3 (unsigned)
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 2, %entry ] ; IV starts at 2
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_in) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_add:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $3, %edi
+; CHECK-NEXT:    jb .LBB17_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    orl $16, %ebx
+; CHECK-NEXT:    negl %r14d
+; CHECK-NEXT:    movl $7, %r15d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB17_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r15d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    leal 1(%r14,%r15), %eax
+; CHECK-NEXT:    movl %r15d, %ecx
+; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    movl %ecx, %r15d
+; CHECK-NEXT:    jne .LBB17_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:  .LBB17_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %rem_amt = or i32 %rem_amt_in, 16 ; forces bit 4 on, so the divisor is at least 16
+  %cmp3.not = icmp ult i32 %N, 3
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 2, %entry ] ; IV starts at 2
+  %i_with_off = add nuw i32 %i.04, 5 ; dividend is IV + 5, add carries nuw
+  %rem = urem i32 %i_with_off, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 %N, i32 %rem_amt_in) nounwind { ; same shape as the _through_add test, but the offset add has no nuw flag
+; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $3, %edi
+; CHECK-NEXT:    jb .LBB18_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    orl $16, %ebx
+; CHECK-NEXT:    negl %r14d
+; CHECK-NEXT:    movl $7, %r15d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB18_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r15d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    leal 1(%r14,%r15), %eax
+; CHECK-NEXT:    movl %r15d, %ecx
+; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    movl %ecx, %r15d
+; CHECK-NEXT:    jne .LBB18_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:  .LBB18_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %rem_amt = or i32 %rem_amt_in, 16 ; forces bit 4 on, so the divisor is at least 16 (unsigned)
+  %cmp3.not = icmp ult i32 %N, 3 ; skip the loop when N < 3
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 2, %entry ]
+  %i_with_off = add i32 %i.04, 5 ; plain add: no nuw here, which is the point of this test
+  %rem = urem i32 %i_with_off, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem(i32 %N, i32 %rem_amt) nounwind { ; IV starts at 2, urem operand is IV + 5, divisor is the unmodified argument
+; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $3, %edi
+; CHECK-NEXT:    jb .LBB19_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    negl %r14d
+; CHECK-NEXT:    movl $7, %r15d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB19_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r15d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    leal 1(%r14,%r15), %eax
+; CHECK-NEXT:    movl %r15d, %ecx
+; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    movl %ecx, %r15d
+; CHECK-NEXT:    jne .LBB19_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:  .LBB19_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp ult i32 %N, 3 ; skip the loop when N < 3
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 2, %entry ]
+  %i_with_off = add nuw i32 %i.04, 5 ; constant offset applied to the IV before the urem
+  %rem = urem i32 %i_with_off, %rem_amt ; divisor is the raw argument: no `or ..., 16` as in the _through_add test
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start_through_sub(i32 %N, i32 %rem_amt, i32 %start) nounwind { ; IV starts at %start; the urem operand is IV - %start (sub nuw)
+; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_sub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    subl %edx, %ebp
+; CHECK-NEXT:    jbe .LBB20_3
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB20_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    jne .LBB20_2
+; CHECK-NEXT:  .LBB20_3: # %for.cond.cleanup
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp ule i32 %N, %start ; skip the loop when N <= start
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ %start, %entry ]
+  %i_with_off = sub nuw i32 %i.04, %start ; subtracts exactly the IV's start value
+  %rem = urem i32 %i_with_off, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i32 %rem_amt, i32 %start) nounwind { ; IV starts at %start, but the urem operand is IV - 2, not IV - %start
+; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_sub_no_simplfy:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl %edx, %edi
+; CHECK-NEXT:    jbe .LBB21_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %edx, %r15d
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %r14d
+; CHECK-NEXT:    negl %r14d
+; CHECK-NEXT:    addl $-2, %r15d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB21_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r15d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    leal 1(%r14,%r15), %eax
+; CHECK-NEXT:    movl %r15d, %ecx
+; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    cmpl $-2, %eax
+; CHECK-NEXT:    movl %ecx, %r15d
+; CHECK-NEXT:    jne .LBB21_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:  .LBB21_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp ule i32 %N, %start ; skip the loop when N <= start
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ %start, %entry ]
+  %i_with_off = sub nuw i32 %i.04, 2 ; constant 2 is unrelated to the IV's start value %start
+  %rem = urem i32 %i_with_off, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
new file mode 100644
index 0000000000000..a019679e65905
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
@@ -0,0 +1,858 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown --loop-simplify -codegenprepare -S | FileCheck %s
+
+declare void @use.i32(i32)
+declare void @use.2xi64(<2 x i64>)
+declare void @do_stuff0()
+declare void @do_stuff1()
+declare i1 @get.i1()
+declare i32 @get.i32()
+
+define void @simple_urem_to_sel(i32 %N, i32 %rem_amt) nounwind { ; baseline: single-block loop, IV counts from 0 by 1 (add nuw), urem of the IV by the argument %rem_amt
+; CHECK-LABEL: define void @simple_urem_to_sel(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; skip the loop entirely when N == 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind { ; the urem sits in a conditionally executed block; the IV phi and its increment are in other blocks
+; CHECK-LABEL: define void @simple_urem_to_sel_nested2(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY_TAIL:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COND0:%.*]] = call i1 @get.i1()
+; CHECK-NEXT:    br i1 [[COND0]], label %[[FOR_BODY0:.*]], label %[[FOR_BODY_TAIL]]
+; CHECK:       [[FOR_BODY0]]:
+; CHECK-NEXT:    [[COND1:%.*]] = call i1 @get.i1()
+; CHECK-NEXT:    br i1 [[COND1]], label %[[FOR_BODY1:.*]], label %[[FOR_BODY2:.*]]
+; CHECK:       [[FOR_BODY2]]:
+; CHECK-NEXT:    [[COND2:%.*]] = call i1 @get.i1()
+; CHECK-NEXT:    br i1 [[COND2]], label %[[FOR_BODY1]], label %[[FOR_BODY_TAIL]]
+; CHECK:       [[FOR_BODY1]]:
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    br label %[[FOR_BODY_TAIL]]
+; CHECK:       [[FOR_BODY_TAIL]]:
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; skip the loop entirely when N == 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body: ; loop header: holds the IV phi, then branches on an opaque condition
+  %i.04 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ]
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+  %cond1 = call i1 @get.i1()
+  br i1 %cond1, label %for.body1, label %for.body2
+for.body2:
+  %cond2 = call i1 @get.i1()
+  br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1: ; reached from for.body0 or for.body2 only, so the urem does not run on every iteration
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  br label %for.body.tail
+for.body.tail: ; latch: increments the IV and tests the exit condition
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr3(i32 %N, i32 %rem_amt) nounwind { ; the phi feeding the urem is not in the loop header and %inc is computed from a call result, not from the phi
+; CHECK-LABEL: define void @simple_urem_fail_bad_incr3(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[COND0:%.*]] = call i1 @get.i1()
+; CHECK-NEXT:    br i1 [[COND0]], label %[[FOR_BODY0:.*]], label %[[FOR_BODY2:.*]]
+; CHECK:       [[FOR_BODY0]]:
+; CHECK-NEXT:    [[COND1:%.*]] = call i1 @get.i1()
+; CHECK-NEXT:    [[VAL:%.*]] = call i32 @get.i32()
+; CHECK-NEXT:    [[INC:%.*]] = add nuw i32 [[VAL]], 1
+; CHECK-NEXT:    br i1 [[COND1]], label %[[FOR_BODY1:.*]], label %[[FOR_BODY_TAIL:.*]]
+; CHECK:       [[FOR_BODY2]]:
+; CHECK-NEXT:    [[COND2:%.*]] = call i1 @get.i1()
+; CHECK-NEXT:    br i1 [[COND2]], label %[[FOR_BODY1]], label %[[FOR_BODY_TAIL]]
+; CHECK:       [[FOR_BODY1]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC]], %[[FOR_BODY0]] ], [ 0, %[[FOR_BODY2]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    br label %[[FOR_BODY_TAIL]]
+; CHECK:       [[FOR_BODY_TAIL]]:
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = call i1 @get.i1()
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; skip the loop entirely when N == 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body: ; loop header: contains no phi at all
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body2
+for.body0:
+  %cond1 = call i1 @get.i1()
+  %val = call i32 @get.i32()
+  %inc = add nuw i32 %val, 1 ; "increment" of an opaque call result rather than of %i.04
+  br i1 %cond1, label %for.body1, label %for.body.tail
+for.body2:
+  %cond2 = call i1 @get.i1()
+  br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1:
+  %i.04 = phi i32 [ %inc, %for.body0], [ 0, %for.body2 ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  br label %for.body.tail
+for.body.tail:
+  %exitcond.not = call i1 @get.i1() ; exit decision comes from an opaque call, not from a counter compare
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind { ; vector variant: <2 x i64> IV from zeroinitializer stepping by <1, 1>, urem by a vector argument
+; CHECK-LABEL: define void @simple_urem_to_sel_vec(
+; CHECK-SAME: <2 x i64> [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem <2 x i64> [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.2xi64(<2 x i64> [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw <2 x i64> [[I_04]], <i64 1, i64 1>
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = call i1 @get.i1()
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry: ; unconditional entry into the loop: there is no trip-count guard
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi <2 x i64> [ %inc, %for.body ], [ zeroinitializer, %entry ]
+  %rem = urem <2 x i64> %i.04, %rem_amt
+  tail call void @use.2xi64(<2 x i64> %rem)
+  %inc = add nuw <2 x i64> %i.04, <i64 1, i64 1>
+  %exitcond.not = call i1 @get.i1() ; exit decision comes from an opaque call
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr(i32 %N, i32 %rem_amt) nounwind { ; the urem operand is a merge of the header IV and an opaque call result, and the increment is taken from that merge
+; CHECK-LABEL: define void @simple_urem_fail_bad_incr(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_03:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY_TAIL:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COND0:%.*]] = call i1 @get.i1()
+; CHECK-NEXT:    br i1 [[COND0]], label %[[FOR_BODY0:.*]], label %[[FOR_BODY_TAIL]]
+; CHECK:       [[FOR_BODY0]]:
+; CHECK-NEXT:    [[SOME_VAL:%.*]] = call i32 @get.i32()
+; CHECK-NEXT:    br label %[[FOR_BODY_TAIL]]
+; CHECK:       [[FOR_BODY_TAIL]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[I_03]], %[[FOR_BODY]] ], [ [[SOME_VAL]], %[[FOR_BODY0]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; skip the loop entirely when N == 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.03 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ]
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+  %some_val = call i32 @get.i32() ; arbitrary value that may replace the counter for this iteration
+  br label %for.body.tail
+
+for.body.tail:
+  %i.04 = phi i32 [ %i.03, %for.body ], [ %some_val, %for.body0 ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1 ; increments the merged value, so %i.03 does not simply advance by 1 each iteration
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_second_acc(i32 %N, i32 %rem_amt) nounwind { ; two IVs: %i.04 (0, step 1) feeds the urem while %i.05 (1, step 2) controls the loop exit
+; CHECK-LABEL: define void @simple_urem_to_sel_second_acc(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[I_05:%.*]] = phi i32 [ [[INC2:%.*]], %[[FOR_BODY]] ], [ 1, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[INC2]] = add nuw i32 [[I_05]], 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp ugt i32 [[INC2]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp ult i32 %N, 2 ; skip the loop when N < 2
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %i.05 = phi i32 [ %inc2, %for.body ], [ 1, %entry ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %inc2 = add nuw i32 %i.05, 2
+  %exitcond.not = icmp ugt i32 %inc2, %N ; exit is decided by the second accumulator, not by the urem's IV
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_srem(i32 %N, i32 %rem_amt) nounwind { ; same loop shape as @simple_urem_to_sel, but the remainder is signed (srem)
+; CHECK-LABEL: define void @simple_urem_fail_srem(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; skip the loop entirely when N == 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %rem = srem i32 %i.04, %rem_amt ; signed remainder: the only difference from the baseline test
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_missing_nuw(i32 %N, i32 %rem_amt) nounwind { ; same loop shape as @simple_urem_to_sel, but the IV increment is `add nsw` rather than `add nuw`
+; CHECK-LABEL: define void @simple_urem_fail_missing_nuw(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; skip the loop entirely when N == 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nsw i32 %i.04, 1 ; nsw only: unsigned wrap of the counter is not excluded
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr2(i32 %N, i32 %rem_amt) nounwind { ; same loop shape as @simple_urem_to_sel, but the IV steps by 2 instead of 1
+; CHECK-LABEL: define void @simple_urem_fail_bad_incr2(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; skip the loop entirely when N == 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 2 ; step of 2: the only difference from the baseline test
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_non_zero_entry4(i32 %N, i32 %rem_amt) nounwind { ; same loop shape as @simple_urem_to_sel, but the IV starts at the constant 4
+; CHECK-LABEL: define void @simple_urem_non_zero_entry4(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 4, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; guard only rules out N == 0; it does not relate N to the start value 4
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 4, %entry ]
+  %rem = urem i32 %i.04, %rem_amt ; nothing in this function bounds %rem_amt relative to the start value 4
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_skip_const_rem_amt(i32 %N) nounwind { ; IV starts at 4 and the urem divisor is the constant 19 instead of a function argument
+; CHECK-LABEL: define void @simple_urem_skip_const_rem_amt(
+; CHECK-SAME: i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 4, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], 19
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; skip the loop entirely when N == 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 4, %entry ]
+  %rem = urem i32 %i.04, 19 ; constant divisor
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_no_preheader_non_canonical(i32 %N, i32 %rem_amt) nounwind { ; the loop header is entered from two outside blocks that supply different IV start values (0 and 1)
+; CHECK-LABEL: define void @simple_urem_fail_no_preheader_non_canonical(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_BODY1:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY1]]:
+; CHECK-NEXT:    br label %[[FOR_BODY_PREHEADER]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[I_04_PH:%.*]] = phi i32 [ 1, %[[FOR_BODY1]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[I_04_PH]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; selects which of the two entry blocks is used; both lead into the loop
+  br i1 %cmp3.not, label %for.body0, label %for.body1
+
+for.cond.cleanup:
+  ret void
+
+for.body0:
+  br label %for.body
+
+for.body1:
+  br label %for.body
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %for.body0 ], [ 1, %for.body1 ] ; in the expected output the two entry values are merged by a phi in a single preheader block
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_multi_latch_non_canonical(i32 %N, i32 %rem_amt) nounwind { ; the loop header has two back edges (from for.body itself and from for.body0), both carrying %inc
+; CHECK-LABEL: define void @simple_urem_multi_latch_non_canonical(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[INC]], %[[FOR_BODY0:.*]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[COND:%.*]] = call i1 @get.i1()
+; CHECK-NEXT:    call void @do_stuff0()
+; CHECK-NEXT:    br i1 [[COND]], label %[[FOR_BODY0]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_BODY0]]:
+; CHECK-NEXT:    call void @do_stuff1()
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; skip the loop entirely when N == 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ %inc, %for.body0 ], [ 0, %entry ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %cond = call i1 @get.i1()
+  call void @do_stuff0()
+  br i1 %cond, label %for.body0, label %for.body ; false edge is the first back edge (self-loop)
+for.body0:
+  call void @do_stuff1()
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body ; false edge is the second back edge
+}
+
+define void @simple_urem_fail_bad_loop(i32 %N, i32 %rem_amt) nounwind { ; the cycle for.cond -> for.body -> halfway is entered from %entry at two different blocks (for.cond and halfway)
+; CHECK-LABEL: define void @simple_urem_fail_bad_loop(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @get.i32()
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CALL]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[FOR_COND:.*]], label %[[HALFWAY:.*]]
+; CHECK:       [[FOR_COND]]:
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ [[INC:%.*]], %[[HALFWAY]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I_0]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[I_0]], 1
+; CHECK-NEXT:    call void @use.i32(i32 [[XOR]])
+; CHECK-NEXT:    br label %[[HALFWAY]]
+; CHECK:       [[HALFWAY]]:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[I_0]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_1]], [[REM_AMT]]
+; CHECK-NEXT:    call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_1]], 1
+; CHECK-NEXT:    br label %[[FOR_COND]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = call i32 @get.i32()
+  %tobool.not = icmp eq i32 %call, 0
+  br i1 %tobool.not, label %for.cond, label %halfway ; may jump straight into the middle of the cycle
+
+for.cond:
+  %i.0 = phi i32 [ %inc, %halfway ], [ 0, %entry ]
+  %cmp = icmp ult i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %xor = xor i32 %i.0, 1
+  call void @use.i32(i32 %xor)
+  br label %halfway
+
+halfway:
+  %i.1 = phi i32 [ poison, %entry ], [ %i.0, %for.body ] ; poison on the direct edge from %entry
+  %rem = urem i32 %i.1, %rem_amt
+  call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.1, 1
+  br label %for.cond
+
+for.end:
+  ret void
+}
+
+define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind { ; the urem operand is a separate `IV + 1` value (%inc2), not the IV phi itself
+; CHECK-LABEL: define void @simple_urem_fail_intermediate_inc(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INC2:%.*]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[INC2]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp eq i32 %N, 0 ; skip the loop entirely when N == 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %inc2 = add nuw i32 %i.04, 1 ; same computation as %inc below, but a distinct instruction
+  %rem = urem i32 %inc2, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @weird_loop(i64 %sub.ptr.div.i56) personality ptr null {
+; CHECK-LABEL: define void @weird_loop(
+; CHECK-SAME: i64 [[SUB_PTR_DIV_I56:%.*]]) personality ptr null {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ADD74_US:%.*]] = add nuw i64 0, 1
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM_US:%.*]] = urem i64 0, [[SUB_PTR_DIV_I56]]
+; CHECK-NEXT:    br label %[[FOR_BODY]]
+;
+entry:
+  br label %for.preheader
+
+for.preheader:
+  %i57.0540.us = phi i64 [ 0, %entry ], [ %add74.us, %for.body ] ; IV phi lives here, not in %for.body
+  %add74.us = add nuw i64 %i57.0540.us, 1
+  br label %for.body
+
+for.body:
+  %rem.us = urem i64 %i57.0540.us, %sub.ptr.div.i56 ; urem sits in a different block than the phi it consumes
+  br i1 false, label %for.preheader, label %for.body ; constant-false condition: control always returns to %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: define void @simple_urem_to_sel_non_zero_start(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp ult i32 [[N]], 3
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp ult i32 %N, 3 ; skip the loop when N < 3
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 2, %entry ] ; IV starts at 2 rather than 0
+  %rem = urem i32 %i.04, %rem_amt ; urem directly on the IV phi; the checks above expect it to remain
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_in) nounwind {
+; CHECK-LABEL: define void @simple_urem_to_sel_non_zero_start_through_add(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT_IN:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[REM_AMT:%.*]] = or i32 [[REM_AMT_IN]], 16
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp ult i32 [[N]], 3
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = add nuw i32 [[I_04]], 5
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %rem_amt = or i32 %rem_amt_in, 16 ; forces bit 4 on, so %rem_amt >= 16; presumably so (2 + 5) u% %rem_amt is a known value - TODO confirm
+  %cmp3.not = icmp ult i32 %N, 3
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 2, %entry ] ; IV starts at 2
+  %i_with_off = add nuw i32 %i.04, 5 ; nuw add of a constant offset on top of the IV
+  %rem = urem i32 %i_with_off, %rem_amt ; urem of (IV + 5); the checks above expect it to remain
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 %N, i32 %rem_amt_in) nounwind {
+; CHECK-LABEL: define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT_IN:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[REM_AMT:%.*]] = or i32 [[REM_AMT_IN]], 16
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp ult i32 [[N]], 3
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = add i32 [[I_04]], 5
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %rem_amt = or i32 %rem_amt_in, 16 ; forces bit 4 on, so %rem_amt >= 16
+  %cmp3.not = icmp ult i32 %N, 3
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 2, %entry ] ; IV starts at 2
+  %i_with_off = add i32 %i.04, 5 ; plain add: the offset add carries no nuw flag
+  %rem = urem i32 %i_with_off, %rem_amt ; the checks above expect this urem to remain
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp ult i32 [[N]], 3
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = add nuw i32 [[I_04]], 5
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp ult i32 %N, 3
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 2, %entry ] ; IV starts at 2
+  %i_with_off = add nuw i32 %i.04, 5 ; nuw add of a constant offset on top of the IV
+  %rem = urem i32 %i_with_off, %rem_amt ; %rem_amt is the raw argument (nothing bounds it from below); the checks above expect the urem to remain
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start_through_sub(i32 %N, i32 %rem_amt, i32 %start) nounwind {
+; CHECK-LABEL: define void @simple_urem_to_sel_non_zero_start_through_sub(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp ule i32 [[N]], [[START]]
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = sub nuw i32 [[I_04]], [[START]]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp ule i32 %N, %start ; skip the loop when N <= start
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ %start, %entry ] ; IV starts at the runtime value %start
+  %i_with_off = sub nuw i32 %i.04, %start ; subtracts the IV's own start value, so this is 0 on the first iteration
+  %rem = urem i32 %i_with_off, %rem_amt ; urem of (IV - start); the checks above expect it to remain
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i32 %rem_amt, i32 %start) nounwind {
+; CHECK-LABEL: define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[REM_AMT:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp ule i32 [[N]], [[START]]
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = sub nuw i32 [[I_04]], 2
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
+; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %cmp3.not = icmp ule i32 %N, %start ; skip the loop when N <= start
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ %start, %entry ] ; IV starts at the runtime value %start
+  %i_with_off = sub nuw i32 %i.04, 2 ; subtracts the constant 2, not %start, so the first value is %start - 2
+  %rem = urem i32 %i_with_off, %rem_amt ; urem of (IV - 2); the checks above expect it to remain
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}

>From 441fa07861c3af026fdf1c1cdcd1b485a5fbf28b Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Tue, 25 Jun 2024 19:51:39 +0800
Subject: [PATCH 2/4] [CodeGenPrepare] Folding `urem` with loop invariant value

```
for(i = Start; i < End; ++i)
   Rem = (i nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;
```
 ->
```
Rem = (Start nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;
for(i = Start; i < End; ++i, ++Rem)
   Rem = Rem == RemAmtLoopInvariant ? 0 : Rem;
```

In its current state, the fold is only performed when `IncrLoopInvariant`
and `Start` are both zero.

Alive2 seemed unable to prove this (see:
https://alive2.llvm.org/ce/z/ATGDp3 which is clearly wrong but still
checks out...), so I wrote an exhaustive test here:
https://godbolt.org/z/WYa561388
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 160 ++++++++++++++++++
 llvm/test/CodeGen/X86/fold-loop-of-urem.ll    | 106 +++++++-----
 .../CodeGenPrepare/X86/fold-loop-of-urem.ll   |  22 ++-
 3 files changed, 240 insertions(+), 48 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 22d0708f54786..e0232afa4d917 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -471,6 +471,7 @@ class CodeGenPrepare {
   bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, Value *Arg1,
                                    CmpInst *Cmp, Intrinsic::ID IID);
   bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
+  bool optimizeRem(Instruction *Rem);
   bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   void verifyBFIUpdates(Function &F);
@@ -1974,6 +1975,160 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
   return true;
 }
 
+static bool isRemOfLoopIncrementWithLoopInvariant(
+    Value *Rem, const LoopInfo *LI, Value *&RemAmtOut,
+    std::optional<bool> &AddOrSubOut, Value *&AddOrSubOffsetOut,
+    PHINode *&LoopIncrPNOut) { // Outputs are only written on success.
+  Value *Incr, *RemAmt;
+  if (!isa<Instruction>(Rem))
+    return false;
+  // NB: If RemAmt is a power of 2 it *should* have been transformed by now.
+  if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
+    return false;
+
+  // Only trivially analyzable loops (need both a preheader and a latch).
+  Loop *L = LI->getLoopFor(cast<Instruction>(Rem)->getParent());
+  if (L == nullptr || L->getLoopPreheader() == nullptr ||
+      L->getLoopLatch() == nullptr)
+    return false;
+
+  std::optional<bool> AddOrSub; // nullopt: bare PHI; true: add; false: sub.
+  Value *AddOrSubOffset;
+  // Find the loop increment PHI.
+  PHINode *PN = dyn_cast<PHINode>(Incr);
+  if (PN != nullptr) {
+    AddOrSub = std::nullopt;
+    AddOrSubOffset = nullptr;
+  } else {
+    // Search through a NUW add/sub.
+    Value *V0, *V1;
+    if (match(Incr, m_NUWAddLike(m_Value(V0), m_Value(V1))))
+      AddOrSub = true;
+    else if (match(Incr, m_NUWSub(m_Value(V0), m_Value(V1))))
+      AddOrSub = false;
+    else
+      return false;
+
+    PN = dyn_cast<PHINode>(V0);
+    if (PN != nullptr) {
+      AddOrSubOffset = V1;
+    } else if (*AddOrSub) { // Only an add may have the PHI on the RHS.
+      PN = dyn_cast<PHINode>(V1);
+      AddOrSubOffset = V0;
+    }
+  }
+
+  if (PN == nullptr)
+    return false;
+
+  // This isn't strictly necessary, what we really need is one increment and any
+  // amount of initial values all being the same.
+  if (PN->getNumIncomingValues() != 2)
+    return false;
+
+  // Only works if the remainder amount is loop invariant.
+  if (!L->isLoopInvariant(RemAmt))
+    return false;
+
+  // Is the PHI a loop increment?
+  auto LoopIncrInfo = getIVIncrement(PN, LI);
+  if (!LoopIncrInfo.has_value())
+    return false;
+
+  // We need remainder_amount % increment_amount to be zero. Increment of one
+  // satisfies that without any special logic and is overwhelmingly the common
+  // case.
+  if (!match(LoopIncrInfo->second, m_One()))
+    return false;
+
+  // Need the increment to not overflow (it must be a NUW add).
+  if (!match(LoopIncrInfo->first, m_NUWAdd(m_Value(), m_Value())))
+    return false;
+
+  if (PN->getBasicBlockIndex(L->getLoopLatch()) < 0 ||
+      PN->getBasicBlockIndex(L->getLoopPreheader()) < 0)
+    return false;
+
+  // Set output variables.
+  RemAmtOut = RemAmt;
+  LoopIncrPNOut = PN;
+  AddOrSubOut = AddOrSub;
+  AddOrSubOffsetOut = AddOrSubOffset;
+
+  return true;
+}
+
+// Try to transform:
+//
+// for(i = Start; i < End; ++i)
+//    Rem = (i nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;
+//
+// ->
+//
+// Rem = (Start nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;
+// for(i = Start; i < End; ++i, ++Rem)
+//    Rem = Rem == RemAmtLoopInvariant ? 0 : Rem;
+//
+// Currently only implemented for `Start` and `IncrLoopInvariant` being zero.
+static bool foldURemOfLoopIncrement(Instruction *Rem, const LoopInfo *LI,
+                                    SmallSet<BasicBlock *, 32> &FreshBBs,
+                                    bool IsHuge) {
+  std::optional<bool> AddOrSub;
+  Value *AddOrSubOffset, *RemAmt;
+  PHINode *LoopIncrPN;
+  if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, AddOrSub,
+                                             AddOrSubOffset, LoopIncrPN))
+    return false;
+
+  // Only non-constant remainder as the extra IV is probably not profitable
+  // in that case. Further, since remainder amount is non-constant, only handle
+  // case where `IncrLoopInvariant` and `Start` are 0 to entirely eliminate the
+  // rem (as opposed to just hoisting it outside of the loop).
+  //
+  // Potential TODO: Should we have a check for how "nested" this remainder
+  // operation is? The new code runs every iteration so if the remainder is
+  // guarded behind unlikely conditions this might not be worth it.
+  if (AddOrSub.has_value() || match(RemAmt, m_ImmConstant()))
+    return false;
+  Loop *L = LI->getLoopFor(Rem->getParent());
+  if (!match(LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader()),
+             m_Zero()))
+    return false;
+
+  // Create the new induction variable (a PHI) that tracks the remainder.
+  Type *Ty = Rem->getType();
+  IRBuilder<> Builder(Rem->getContext());
+
+  Builder.SetInsertPoint(LoopIncrPN);
+  PHINode *NewRem = Builder.CreatePHI(Ty, 2); // Placed before LoopIncrPN.
+
+  Builder.SetInsertPoint(cast<Instruction>(
+      LoopIncrPN->getIncomingValueForBlock(L->getLoopLatch())));
+  // `(add (urem x, y), 1)` is always nuw.
+  Value *RemAdd = Builder.CreateNUWAdd(NewRem, ConstantInt::get(Ty, 1));
+  Value *RemCmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, RemAdd, RemAmt);
+  Value *RemSel =
+      Builder.CreateSelect(RemCmp, Constant::getNullValue(Ty), RemAdd);
+
+  NewRem->addIncoming(Constant::getNullValue(Ty), L->getLoopPreheader());
+  NewRem->addIncoming(RemSel, L->getLoopLatch());
+
+  // Insert all touched BBs.
+  FreshBBs.insert(LoopIncrPN->getParent());
+  FreshBBs.insert(L->getLoopLatch());
+  FreshBBs.insert(Rem->getParent());
+
+  replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
+  Rem->eraseFromParent();
+  return true;
+}
+
+bool CodeGenPrepare::optimizeRem(Instruction *Rem) {
+  if (foldURemOfLoopIncrement(Rem, LI, FreshBBs, IsHugeFunc))
+    return true; // Rem has been replaced and erased by the fold.
+  return false; // No fold applied; Rem is left untouched.
+}
+
 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
   if (sinkCmpExpression(Cmp, *TLI))
     return true;
@@ -8360,6 +8515,11 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (optimizeCmp(Cmp, ModifiedDT))
       return true;
 
+  if (match(I, m_URem(m_Value(), m_Value())) ||
+      match(I, m_SRem(m_Value(), m_Value())))
+    if (optimizeRem(I))
+      return true;
+
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     bool Modified = optimizeLoadExt(LI);
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
index aad2e0dd7bd24..c2c1124b1ac11 100644
--- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -15,25 +15,31 @@ define void @simple_urem_to_sel(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK-NEXT:    je .LBB0_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r15d, %r15d
 ; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r12d, %r12d
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl %r14d, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %ebx
-; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    movl %r14d, %edi
 ; CHECK-NEXT:    callq use.i32 at PLT
 ; CHECK-NEXT:    incl %r14d
-; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r15d, %r14d
+; CHECK-NEXT:    incl %r12d
+; CHECK-NEXT:    cmpl %r12d, %ebp
 ; CHECK-NEXT:    jne .LBB0_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:  .LBB0_4: # %for.cond.cleanup
 ; CHECK-NEXT:    retq
@@ -60,24 +66,28 @@ define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK-NEXT:    je .LBB1_8
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r15d, %r15d
 ; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r12d, %r12d
 ; CHECK-NEXT:    jmp .LBB1_2
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB1_5: # %for.body1
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    movl %r14d, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %ebx
-; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    movl %r14d, %edi
 ; CHECK-NEXT:    callq use.i32 at PLT
 ; CHECK-NEXT:  .LBB1_6: # %for.body.tail
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    incl %r14d
-; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r15d, %r14d
+; CHECK-NEXT:    incl %r12d
+; CHECK-NEXT:    cmpl %r12d, %ebp
 ; CHECK-NEXT:    je .LBB1_7
 ; CHECK-NEXT:  .LBB1_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -97,7 +107,9 @@ define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK-NEXT:    jmp .LBB1_6
 ; CHECK-NEXT:  .LBB1_7:
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:  .LBB1_8: # %for.cond.cleanup
 ; CHECK-NEXT:    retq
@@ -213,40 +225,36 @@ for.body.tail:
 define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
 ; CHECK-LABEL: simple_urem_to_sel_vec:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %r14
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    subq $56, %rsp
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    movq %xmm0, %rbx
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-NEXT:    movq %xmm0, %r14
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB3_1: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    movq %xmm1, %rax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divq %rbx
-; CHECK-NEXT:    movq %rdx, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; CHECK-NEXT:    movq %xmm1, %rax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divq %r14
-; CHECK-NEXT:    movq %rdx, %xmm1
-; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    callq use.2xi64 at PLT
-; CHECK-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    psubq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    movdqa (%rsp), %xmm2 # 16-byte Reload
+; CHECK-NEXT:    psubq %xmm1, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm2, %xmm3
+; CHECK-NEXT:    pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; CHECK-NEXT:    pand %xmm0, %xmm2
+; CHECK-NEXT:    pandn %xmm3, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    psubq %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    callq get.i1 at PLT
 ; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    je .LBB3_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT:    addq $24, %rsp
-; CHECK-NEXT:    popq %rbx
-; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    addq $56, %rsp
 ; CHECK-NEXT:    retq
 entry:
   br label %for.body
@@ -336,27 +344,33 @@ define void @simple_urem_to_sel_second_acc(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edi, %ebp
 ; CHECK-NEXT:    movl $1, %r15d
+; CHECK-NEXT:    xorl %r12d, %r12d
 ; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r13d, %r13d
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB5_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl %r14d, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %ebx
-; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    movl %r14d, %edi
 ; CHECK-NEXT:    callq use.i32 at PLT
 ; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r12d, %r14d
+; CHECK-NEXT:    incl %r13d
 ; CHECK-NEXT:    addl $2, %r15d
 ; CHECK-NEXT:    cmpl %ebp, %r15d
 ; CHECK-NEXT:    jbe .LBB5_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    addq $8, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
 ; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
@@ -676,23 +690,27 @@ define void @simple_urem_multi_latch_non_canonical(i32 %N, i32 %rem_amt) nounwin
 ; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edi, %ebp
 ; CHECK-NEXT:    decl %ebp
+; CHECK-NEXT:    xorl %r12d, %r12d
 ; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r13d, %r13d
 ; CHECK-NEXT:    jmp .LBB12_2
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB12_3: # %for.body.backedge
 ; CHECK-NEXT:    # in Loop: Header=BB12_2 Depth=1
 ; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r12d, %r14d
+; CHECK-NEXT:    incl %r13d
 ; CHECK-NEXT:  .LBB12_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl %r14d, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %ebx
-; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    movl %r14d, %edi
 ; CHECK-NEXT:    callq use.i32 at PLT
 ; CHECK-NEXT:    callq get.i1 at PLT
 ; CHECK-NEXT:    movl %eax, %r15d
@@ -702,11 +720,13 @@ define void @simple_urem_multi_latch_non_canonical(i32 %N, i32 %rem_amt) nounwin
 ; CHECK-NEXT:  # %bb.4: # %for.body0
 ; CHECK-NEXT:    # in Loop: Header=BB12_2 Depth=1
 ; CHECK-NEXT:    callq do_stuff1 at PLT
-; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    cmpl %r13d, %ebp
 ; CHECK-NEXT:    jne .LBB12_3
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    addq $8, %rsp
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
 ; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
index a019679e65905..2f4652b425753 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
@@ -19,9 +19,12 @@ define void @simple_urem_to_sel(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -53,7 +56,8 @@ define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
-; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY_TAIL:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY_TAIL:.*]] ]
+; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY_TAIL]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[COND0:%.*]] = call i1 @get.i1()
 ; CHECK-NEXT:    br i1 [[COND0]], label %[[FOR_BODY0:.*]], label %[[FOR_BODY_TAIL]]
 ; CHECK:       [[FOR_BODY0]]:
@@ -63,10 +67,12 @@ define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK-NEXT:    [[COND2:%.*]] = call i1 @get.i1()
 ; CHECK-NEXT:    br i1 [[COND2]], label %[[FOR_BODY1]], label %[[FOR_BODY_TAIL]]
 ; CHECK:       [[FOR_BODY1]]:
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
 ; CHECK-NEXT:    br label %[[FOR_BODY_TAIL]]
 ; CHECK:       [[FOR_BODY_TAIL]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -164,9 +170,12 @@ define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
 ; CHECK:       [[FOR_COND_CLEANUP:.*]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT:    [[REM:%.*]] = urem <2 x i64> [[I_04]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.2xi64(<2 x i64> [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw <2 x i64> [[REM]], <i64 1, i64 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select <2 x i1> [[TMP2]], <2 x i64> zeroinitializer, <2 x i64> [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw <2 x i64> [[I_04]], <i64 1, i64 1>
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = call i1 @get.i1()
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -246,10 +255,13 @@ define void @simple_urem_to_sel_second_acc(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[I_05:%.*]] = phi i32 [ [[INC2:%.*]], %[[FOR_BODY]] ], [ 1, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[INC2]] = add nuw i32 [[I_05]], 2
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp ugt i32 [[INC2]], [[N]]

>From 336f2acd2be0aaece58bb56f549d98d51ac685fd Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Thu, 18 Jul 2024 20:01:51 +0800
Subject: [PATCH 3/4] more better comments

---
 llvm/lib/CodeGen/CodeGenPrepare.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index e0232afa4d917..d3ed74fcb9dc7 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2000,7 +2000,7 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
     AddOrSub = std::nullopt;
     AddOrSubOffset = nullptr;
   } else {
-    // Search through a NUW add/sub.
+    // Search through a NUW add/sub on top of the loop increment.
     Value *V0, *V1;
     if (match(Incr, m_NUWAddLike(m_Value(V0), m_Value(V1))))
       AddOrSub = true;
@@ -2045,6 +2045,7 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
   if (!match(LoopIncrInfo->first, m_NUWAdd(m_Value(), m_Value())))
     return false;
 
+  // Need unique loop preheader and latch.
   if (PN->getBasicBlockIndex(L->getLoopLatch()) < 0 ||
       PN->getBasicBlockIndex(L->getLoopPreheader()) < 0)
     return false;
@@ -2080,12 +2081,16 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const LoopInfo *LI,
                                              AddOrSubOffset, LoopIncrPN))
     return false;
 
-  // Only non-constant remainder as the extra IV is is probably not profitable
+  // Only non-constant remainder as the extra IV is probably not profitable
   // in that case. Further, since remainder amount is non-constant, only handle
   // case where `IncrLoopInvariant` and `Start` are 0 to entirely eliminate the
   // rem (as opposed to just hoisting it outside of the loop).
   //
-  // Potential TODO: Should we have a check for how "nested" this remainder
+  // Potential TODO(1): `urem` of a const ends up as `mul` + `shift` + `add`. If
+  // we can rule out register pressure and ensure this `urem` is executed each
+  // iteration, it's probably profitable to handle the const case as well.
+  //
+  // Potential TODO(2): Should we have a check for how "nested" this remainder
   // operation is? The new code runs every iteration so if the remainder is
   // guarded behind unlikely conditions this might not be worth it.
   if (AddOrSub.has_value() || match(RemAmt, m_ImmConstant()))

>From e88b6f144e74b29f5f99801e8798b4bd41ae21bd Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Fri, 26 Jul 2024 01:30:11 +0800
Subject: [PATCH 4/4] handle add sub

---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 92 ++++++++++++-------
 llvm/test/CodeGen/X86/fold-loop-of-urem.ll    | 32 +++++--
 .../CodeGenPrepare/X86/fold-loop-of-urem.ll   | 27 ++++--
 3 files changed, 100 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index d3ed74fcb9dc7..abf36c281eaed 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -471,7 +471,7 @@ class CodeGenPrepare {
   bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, Value *Arg1,
                                    CmpInst *Cmp, Intrinsic::ID IID);
   bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
-  bool optimizeRem(Instruction *Rem);
+  bool optimizeURem(Instruction *Rem);
   bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   void verifyBFIUpdates(Function &F);
@@ -1976,26 +1976,23 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
 }
 
 static bool isRemOfLoopIncrementWithLoopInvariant(
-    Value *Rem, const LoopInfo *LI, Value *&RemAmtOut,
-    std::optional<bool> &AddOrSubOut, Value *&AddOrSubOffsetOut,
-    PHINode *&LoopIncrPNOut) {
+    Instruction *Rem, const LoopInfo *LI, Value *&RemAmtOut,
+    std::optional<bool> &AddOrSubOut, Value *&AddOrSubInstOut,
+    Value *&AddOrSubOffsetOut, PHINode *&LoopIncrPNOut) {
   Value *Incr, *RemAmt;
-  if (!isa<Instruction>(Rem))
-    return false;
   // NB: If RemAmt is a power of 2 it *should* have been transformed by now.
   if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
     return false;
 
   // Only trivially analyzable loops.
-  Loop *L = LI->getLoopFor(cast<Instruction>(Rem)->getParent());
-  if (L == nullptr || L->getLoopPreheader() == nullptr ||
-      L->getLoopLatch() == nullptr)
+  Loop *L = LI->getLoopFor(Rem->getParent());
+  if (!L || !L->getLoopPreheader() || !L->getLoopLatch())
     return false;
 
   std::optional<bool> AddOrSub;
   Value *AddOrSubOffset;
   // Find out loop increment PHI.
-  PHINode *PN = dyn_cast<PHINode>(Incr);
+  auto *PN = dyn_cast<PHINode>(Incr);
   if (PN != nullptr) {
     AddOrSub = std::nullopt;
     AddOrSubOffset = nullptr;
@@ -2009,6 +2006,8 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
     else
       return false;
 
+    AddOrSubInstOut = Incr;
+
     PN = dyn_cast<PHINode>(V0);
     if (PN != nullptr) {
       AddOrSubOffset = V1;
@@ -2018,7 +2017,7 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
     }
   }
 
-  if (PN == nullptr)
+  if (!PN)
     return false;
 
   // This isn't strictly necessary, what we really need is one increment and any
@@ -2032,7 +2031,12 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
 
   // Is the PHI a loop increment?
   auto LoopIncrInfo = getIVIncrement(PN, LI);
-  if (!LoopIncrInfo.has_value())
+  if (!LoopIncrInfo)
+    return false;
+
+  // getIVIncrement finds the loop at PN->getParent(). This might be a different
+  // loop from the loop with Rem->getParent().
+  if (L->getHeader() != PN->getParent())
     return false;
 
   // We need remainder_amount % increment_amount to be zero. Increment of one
@@ -2045,11 +2049,6 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
   if (!match(LoopIncrInfo->first, m_NUWAdd(m_Value(), m_Value())))
     return false;
 
-  // Need unique loop preheader and latch.
-  if (PN->getBasicBlockIndex(L->getLoopLatch()) < 0 ||
-      PN->getBasicBlockIndex(L->getLoopPreheader()) < 0)
-    return false;
-
   // Set output variables.
   RemAmtOut = RemAmt;
   LoopIncrPNOut = PN;
@@ -2071,20 +2070,19 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
 //    Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
 //
 // Currently only implemented for `Start` and `IncrLoopInvariant` being zero.
-static bool foldURemOfLoopIncrement(Instruction *Rem, const LoopInfo *LI,
+static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
+                                    const LoopInfo *LI,
                                     SmallSet<BasicBlock *, 32> &FreshBBs,
                                     bool IsHuge) {
   std::optional<bool> AddOrSub;
-  Value *AddOrSubOffset, *RemAmt;
+  Value *AddOrSubOffset, *RemAmt, *AddOrSubInst;
   PHINode *LoopIncrPN;
-  if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, AddOrSub,
-                                             AddOrSubOffset, LoopIncrPN))
+  if (!isRemOfLoopIncrementWithLoopInvariant(
+          Rem, LI, RemAmt, AddOrSub, AddOrSubInst, AddOrSubOffset, LoopIncrPN))
     return false;
 
   // Only non-constant remainder as the extra IV is probably not profitable
-  // in that case. Further, since remainder amount is non-constant, only handle
-  // case where `IncrLoopInvariant` and `Start` are 0 to entirely eliminate the
-  // rem (as opposed to just hoisting it outside of the loop).
+  // in that case.
   //
   // Potential TODO(1): `urem` of a const ends up as `mul` + `shift` + `add`. If
   // we can rule out register pressure and ensure this `urem` is executed each
@@ -2093,12 +2091,37 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const LoopInfo *LI,
   // Potential TODO(2): Should we have a check for how "nested" this remainder
   // operation is? The new code runs every iteration so if the remainder is
   // guarded behind unlikely conditions this might not be worth it.
-  if (AddOrSub.has_value() || match(RemAmt, m_ImmConstant()))
+  if (match(RemAmt, m_ImmConstant()))
     return false;
   Loop *L = LI->getLoopFor(Rem->getParent());
-  if (!match(LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader()),
-             m_Zero()))
-    return false;
+
+  // If we have add/sub create initial value for remainder.
+  // The logic here is:
+  // (urem (add/sub nuw Start, IncrLoopInvariant), RemAmtLoopInvariant)
+  //
+  // Only proceed if the expression simplifies (otherwise we can't fully
+  // optimize out the urem).
+  Value *Start = LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader());
+  if (AddOrSub) {
+    assert(AddOrSubOffset && AddOrSubInst &&
+           "We found an add/sub but missing values");
+    // Without dom-condition/assumption cache we aren't likely to get much out
+    // of a context instruction.
+    const SimplifyQuery Q(*DL);
+    bool NSW = cast<OverflowingBinaryOperator>(AddOrSubInst)->hasNoSignedWrap();
+    if (*AddOrSub)
+      Start = simplifyAddInst(Start, AddOrSubOffset, /*IsNSW=*/NSW,
+                              /*IsNUW=*/true, Q);
+    else
+      Start = simplifySubInst(Start, AddOrSubOffset, /*IsNSW=*/NSW,
+                              /*IsNUW=*/true, Q);
+    if (!Start)
+      return false;
+
+    Start = simplifyURemInst(Start, RemAmt, Q);
+    if (!Start)
+      return false;
+  }
 
   // Create new remainder with induction variable.
   Type *Ty = Rem->getType();
@@ -2115,7 +2138,7 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const LoopInfo *LI,
   Value *RemSel =
       Builder.CreateSelect(RemCmp, Constant::getNullValue(Ty), RemAdd);
 
-  NewRem->addIncoming(Constant::getNullValue(Ty), L->getLoopPreheader());
+  NewRem->addIncoming(Start, L->getLoopPreheader());
   NewRem->addIncoming(RemSel, L->getLoopLatch());
 
   // Insert all touched BBs.
@@ -2125,11 +2148,13 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const LoopInfo *LI,
 
   replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
   Rem->eraseFromParent();
+  if (AddOrSubInst && AddOrSubInst->use_empty())
+    cast<Instruction>(AddOrSubInst)->eraseFromParent();
   return true;
 }
 
-bool CodeGenPrepare::optimizeRem(Instruction *Rem) {
-  if (foldURemOfLoopIncrement(Rem, LI, FreshBBs, IsHugeFunc))
+bool CodeGenPrepare::optimizeURem(Instruction *Rem) {
+  if (foldURemOfLoopIncrement(Rem, DL, LI, FreshBBs, IsHugeFunc))
     return true;
   return false;
 }
@@ -8520,9 +8545,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (optimizeCmp(Cmp, ModifiedDT))
       return true;
 
-  if (match(I, m_URem(m_Value(), m_Value())) ||
-      match(I, m_SRem(m_Value(), m_Value())))
-    if (optimizeRem(I))
+  if (match(I, m_URem(m_Value(), m_Value())))
+    if (optimizeURem(I))
       return true;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
index c2c1124b1ac11..b9fe4d7c79c7f 100644
--- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -890,25 +890,31 @@ define void @simple_urem_to_sel_non_zero_start(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK-NEXT:    jb .LBB16_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edi, %ebp
 ; CHECK-NEXT:    movl $2, %r14d
+; CHECK-NEXT:    xorl %r15d, %r15d
+; CHECK-NEXT:    movl $2, %r12d
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB16_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl %r14d, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %ebx
-; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    movl %r14d, %edi
 ; CHECK-NEXT:    callq use.i32 at PLT
 ; CHECK-NEXT:    incl %r14d
-; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r15d, %r14d
+; CHECK-NEXT:    incl %r12d
+; CHECK-NEXT:    cmpl %r12d, %ebp
 ; CHECK-NEXT:    jne .LBB16_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:  .LBB16_4: # %for.cond.cleanup
 ; CHECK-NEXT:    retq
@@ -1086,28 +1092,34 @@ define void @simple_urem_to_sel_non_zero_start_through_sub(i32 %N, i32 %rem_amt,
 ; CHECK-LABEL: simple_urem_to_sel_non_zero_start_through_sub:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %edi, %ebp
 ; CHECK-NEXT:    subl %edx, %ebp
 ; CHECK-NEXT:    jbe .LBB20_3
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    xorl %r15d, %r15d
 ; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r12d, %r12d
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB20_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl %r14d, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %ebx
-; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    movl %r14d, %edi
 ; CHECK-NEXT:    callq use.i32 at PLT
 ; CHECK-NEXT:    incl %r14d
-; CHECK-NEXT:    cmpl %r14d, %ebp
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r15d, %r14d
+; CHECK-NEXT:    incl %r12d
+; CHECK-NEXT:    cmpl %r12d, %ebp
 ; CHECK-NEXT:    jne .LBB20_2
 ; CHECK-NEXT:  .LBB20_3: # %for.cond.cleanup
 ; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
index 2f4652b425753..61bbe08e293b3 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
@@ -398,9 +398,12 @@ define void @simple_urem_non_zero_entry4(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 4, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 4, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -469,9 +472,12 @@ define void @simple_urem_fail_no_preheader_non_canonical(i32 %N, i32 %rem_amt) n
 ; CHECK-NEXT:    [[I_04_PH:%.*]] = phi i32 [ 1, %[[FOR_BODY1]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ [[I_04_PH]], %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[I_04_PH]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -662,9 +668,12 @@ define void @simple_urem_to_sel_non_zero_start(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 2, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_04]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -697,10 +706,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 7, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = add nuw i32 [[I_04]], 5
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -808,10 +819,12 @@ define void @simple_urem_to_sel_non_zero_start_through_sub(i32 %N, i32 %rem_amt,
 ; CHECK:       [[FOR_COND_CLEANUP]]:
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[REM:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
 ; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[I_WITH_OFF:%.*]] = sub nuw i32 [[I_04]], [[START]]
-; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
 ; CHECK-NEXT:    tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT:    [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_04]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]



More information about the llvm-commits mailing list