[llvm] goldsteinn/cgp urem of liv (PR #96625)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 25 05:08:52 PDT 2024
https://github.com/goldsteinn created https://github.com/llvm/llvm-project/pull/96625
- **[CodeGenPrepare][X86] Add tests for folding `urem` with loop invariant value; NFC**
- **[CodeGenPrepare] Folding `urem` with loop invariant value**
>From c94889ed0663ce50bd44167689b8eb0cd58c6967 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Tue, 25 Jun 2024 19:50:51 +0800
Subject: [PATCH 1/2] [CodeGenPrepare][X86] Add tests for folding `urem` with
loop invariant value; NFC
---
llvm/test/CodeGen/X86/fold-loop-of-urem.ll | 844 +++++++++++++++++++++
1 file changed, 844 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/fold-loop-of-urem.ll
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
new file mode 100644
index 0000000000000..21e222bf2b1ae
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -0,0 +1,844 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare void @use.i32(i32)
+declare void @use.2xi64(<2 x i64>)
+declare void @do_stuff0()
+declare void @do_stuff1()
+declare i1 @get.i1()
+declare i32 @get.i32()
+
+define void @simple_urem_to_sel(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB0_4
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %r14d, %ebp
+; CHECK-NEXT: jne .LBB0_2
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB0_4: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_nested2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB1_8
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: jmp .LBB1_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB1_5: # %for.body1
+; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: .LBB1_6: # %for.body.tail
+; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %r14d, %ebp
+; CHECK-NEXT: je .LBB1_7
+; CHECK-NEXT: .LBB1_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: callq get.i1 at PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB1_6
+; CHECK-NEXT: # %bb.3: # %for.body0
+; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: callq get.i1 at PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: jne .LBB1_5
+; CHECK-NEXT: # %bb.4: # %for.body2
+; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: callq get.i1 at PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: jne .LBB1_5
+; CHECK-NEXT: jmp .LBB1_6
+; CHECK-NEXT: .LBB1_7:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB1_8: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ]
+ %cond0 = call i1 @get.i1()
+ br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+ %cond1 = call i1 @get.i1()
+ br i1 %cond1, label %for.body1, label %for.body2
+for.body2:
+ %cond2 = call i1 @get.i1()
+ br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1:
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ br label %for.body.tail
+for.body.tail:
+ %inc = add nuw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr3(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB2_9
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: jmp .LBB2_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB2_6: # %for.body1
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: .LBB2_7: # %for.body.tail
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: callq get.i1 at PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: jne .LBB2_8
+; CHECK-NEXT: .LBB2_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: callq get.i1 at PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB2_5
+; CHECK-NEXT: # %bb.3: # %for.body0
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: callq get.i1 at PLT
+; CHECK-NEXT: movl %eax, %r14d
+; CHECK-NEXT: callq get.i32 at PLT
+; CHECK-NEXT: testb $1, %r14b
+; CHECK-NEXT: je .LBB2_7
+; CHECK-NEXT: # %bb.4: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: movl %eax, %ebp
+; CHECK-NEXT: incl %ebp
+; CHECK-NEXT: jmp .LBB2_6
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB2_5: # %for.body2
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: callq get.i1 at PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: jne .LBB2_6
+; CHECK-NEXT: jmp .LBB2_7
+; CHECK-NEXT: .LBB2_8:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB2_9: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %cond0 = call i1 @get.i1()
+ br i1 %cond0, label %for.body0, label %for.body2
+for.body0:
+ %cond1 = call i1 @get.i1()
+ %val = call i32 @get.i32()
+ %inc = add nuw i32 %val, 1
+ br i1 %cond1, label %for.body1, label %for.body.tail
+for.body2:
+ %cond2 = call i1 @get.i1()
+ br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1:
+ %i.04 = phi i32 [ %inc, %for.body0], [ 0, %for.body2 ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ br label %for.body.tail
+for.body.tail:
+ %exitcond.not = call i1 @get.i1()
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_vec:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movq %xmm0, %rbx
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; CHECK-NEXT: movq %xmm0, %r14
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB3_1: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movq %xmm1, %rax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divq %rbx
+; CHECK-NEXT: movq %rdx, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; CHECK-NEXT: movq %xmm1, %rax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divq %r14
+; CHECK-NEXT: movq %rdx, %xmm1
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: callq use.2xi64 at PLT
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psubq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: callq get.i1 at PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB3_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: retq
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi <2 x i64> [ %inc, %for.body ], [ zeroinitializer, %entry ]
+ %rem = urem <2 x i64> %i.04, %rem_amt
+ tail call void @use.2xi64(<2 x i64> %rem)
+ %inc = add nuw <2 x i64> %i.04, <i64 1, i64 1>
+ %exitcond.not = call i1 @get.i1()
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB4_6
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: jmp .LBB4_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB4_4: # %for.body.tail
+; CHECK-NEXT: # in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %ebp, %r14d
+; CHECK-NEXT: je .LBB4_5
+; CHECK-NEXT: .LBB4_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: callq get.i1 at PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB4_4
+; CHECK-NEXT: # %bb.3: # %for.body0
+; CHECK-NEXT: # in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: callq get.i32 at PLT
+; CHECK-NEXT: movl %eax, %r14d
+; CHECK-NEXT: jmp .LBB4_4
+; CHECK-NEXT: .LBB4_5:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB4_6: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.03 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ]
+ %cond0 = call i1 @get.i1()
+ br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+ %some_val = call i32 @get.i32()
+ br label %for.body.tail
+
+for.body.tail:
+ %i.04 = phi i32 [ %i.03, %for.body ], [ %some_val, %for.body0 ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_second_acc(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_second_acc:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $2, %edi
+; CHECK-NEXT: jb .LBB5_4
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl $1, %r15d
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB5_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: addl $2, %r15d
+; CHECK-NEXT: cmpl %ebp, %r15d
+; CHECK-NEXT: jbe .LBB5_2
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB5_4: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp ult i32 %N, 2
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %i.05 = phi i32 [ %inc2, %for.body ], [ 1, %entry ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %inc2 = add nuw i32 %i.05, 2
+ %exitcond.not = icmp ugt i32 %inc2, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_srem(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_srem:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB6_4
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB6_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %r14d, %ebp
+; CHECK-NEXT: jne .LBB6_2
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB6_4: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %rem = srem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_missing_nuw(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_missing_nuw:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB7_4
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB7_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %r14d, %ebp
+; CHECK-NEXT: jne .LBB7_2
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nsw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr2(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB8_4
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB8_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: addl $2, %r14d
+; CHECK-NEXT: cmpl %r14d, %ebp
+; CHECK-NEXT: jne .LBB8_2
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB8_4: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 2
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_non_zero_entry(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_non_zero_entry:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB9_4
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl $4, %r14d
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB9_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %r14d, %ebp
+; CHECK-NEXT: jne .LBB9_2
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB9_4: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ 4, %entry ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_skip_const_rem_amt(i32 %N) nounwind {
+; CHECK-LABEL: simple_urem_skip_const_rem_amt:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB10_4
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %edi, %ebx
+; CHECK-NEXT: addl $-4, %ebx
+; CHECK-NEXT: movl $4, %ebp
+; CHECK-NEXT: movl $2938661835, %r14d # imm = 0xAF286BCB
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB10_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: imulq %r14, %rax
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: movl %ebp, %ecx
+; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: shrl %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: shrl $4, %ecx
+; CHECK-NEXT: leal (%rcx,%rcx,8), %eax
+; CHECK-NEXT: leal (%rcx,%rax,2), %eax
+; CHECK-NEXT: movl %ebp, %edi
+; CHECK-NEXT: subl %eax, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: incl %ebp
+; CHECK-NEXT: decl %ebx
+; CHECK-NEXT: jne .LBB10_2
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB10_4: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ 4, %entry ]
+ %rem = urem i32 %i.04, 19
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_no_preheader_non_canonical(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_no_preheader_non_canonical:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB11_1
+; CHECK-NEXT: # %bb.2: # %for.body1
+; CHECK-NEXT: movl $1, %r14d
+; CHECK-NEXT: jmp .LBB11_3
+; CHECK-NEXT: .LBB11_1:
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB11_3: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %r14d, %ebp
+; CHECK-NEXT: jne .LBB11_3
+; CHECK-NEXT: # %bb.4: # %for.cond.cleanup
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.body0, label %for.body1
+
+for.cond.cleanup:
+ ret void
+
+for.body0:
+ br label %for.body
+
+for.body1:
+ br label %for.body
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ 0, %for.body0 ], [ 1, %for.body1 ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_multi_latch_non_canonical(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_multi_latch_non_canonical:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB12_6
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: decl %ebp
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: jmp .LBB12_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB12_3: # %for.body.backedge
+; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: .LBB12_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: callq get.i1 at PLT
+; CHECK-NEXT: movl %eax, %r15d
+; CHECK-NEXT: callq do_stuff0 at PLT
+; CHECK-NEXT: testb $1, %r15b
+; CHECK-NEXT: je .LBB12_3
+; CHECK-NEXT: # %bb.4: # %for.body0
+; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1
+; CHECK-NEXT: callq do_stuff1 at PLT
+; CHECK-NEXT: cmpl %r14d, %ebp
+; CHECK-NEXT: jne .LBB12_3
+; CHECK-NEXT: # %bb.5:
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB12_6: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ %inc, %for.body0 ], [ 0, %entry ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %cond = call i1 @get.i1()
+ call void @do_stuff0()
+ br i1 %cond, label %for.body0, label %for.body
+for.body0:
+ call void @do_stuff1()
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_loop(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_loop:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: callq get.i32 at PLT
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: # implicit-def: $r14d
+; CHECK-NEXT: jne .LBB13_4
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: .LBB13_2: # %for.cond
+; CHECK-NEXT: cmpl %ebp, %r14d
+; CHECK-NEXT: jae .LBB13_5
+; CHECK-NEXT: # %bb.3: # %for.body
+; CHECK-NEXT: movl %r14d, %edi
+; CHECK-NEXT: xorl $1, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: .LBB13_4: # %halfway
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: jmp .LBB13_2
+; CHECK-NEXT: .LBB13_5: # %for.end
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ %call = call i32 @get.i32()
+ %tobool.not = icmp eq i32 %call, 0
+ br i1 %tobool.not, label %for.cond, label %halfway
+
+for.cond:
+ %i.0 = phi i32 [ %inc, %halfway ], [ 0, %entry ]
+ %cmp = icmp ult i32 %i.0, %N
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ %xor = xor i32 %i.0, 1
+ call void @use.i32(i32 %xor)
+ br label %halfway
+
+halfway:
+ %i.1 = phi i32 [ poison, %entry ], [ %i.0, %for.body ]
+ %rem = urem i32 %i.1, %rem_amt
+ call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.1, 1
+ br label %for.cond
+
+for.end:
+ ret void
+}
+
+define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_intermediate_inc:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB14_4
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %r14d
+; CHECK-NEXT: negl %r14d
+; CHECK-NEXT: movl $1, %r15d
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB14_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %r15d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32 at PLT
+; CHECK-NEXT: leal 1(%r14,%r15), %eax
+; CHECK-NEXT: movl %r15d, %ecx
+; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: cmpl $1, %eax
+; CHECK-NEXT: movl %ecx, %r15d
+; CHECK-NEXT: jne .LBB14_2
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .LBB14_4: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %inc2 = add nuw i32 %i.04, 1
+ %rem = urem i32 %inc2, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
>From 616d93ced5ebd0dce953e29dd28a186a2e188681 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Tue, 25 Jun 2024 19:51:39 +0800
Subject: [PATCH 2/2] [CodeGenPrepare] Folding `urem` with loop invariant value
```
for(i = Start; i < End; ++i)
Rem = (i nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;
```
->
```
Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
for(i = Start; i < End; ++i, ++rem)
Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
```
In its current state, the fold is only applied when `IncrLoopInvariant` and
`Start` are both zero.
Alive2 seemed unable to prove this (see:
https://alive2.llvm.org/ce/z/ATGDp3 which is clearly wrong but still
checks out...), so I wrote an exhaustive test here:
https://godbolt.org/z/WYa561388
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 157 +++++++++++++++++++++
llvm/test/CodeGen/X86/fold-loop-of-urem.ll | 106 ++++++++------
2 files changed, 220 insertions(+), 43 deletions(-)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 900c33b580f15..d2cc3b4cb326c 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -471,6 +471,7 @@ class CodeGenPrepare {
bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, Value *Arg1,
CmpInst *Cmp, Intrinsic::ID IID);
bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool optimizeRem(Instruction *Rem);
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
void verifyBFIUpdates(Function &F);
@@ -1974,6 +1975,157 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
return true;
}
+static bool isRemOfLoopIncrementWithLIV(Value *Rem, const LoopInfo *LI,
+ Value *&RemAmtOut,
+ std::optional<bool> &AddOrSubOut,
+ Value *&AddOrSubOffsetOut,
+ PHINode *&LoopIncrPNOut) {
+ Value *Incr, *RemAmt;
+ if (!isa<Instruction>(Rem))
+ return false;
+ // NB: If RemAmt is a power of 2 it *should* have been transformed by now.
+ if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
+ return false;
+
+ // Only trivially analyzable loops: Rem's loop must have a preheader and a latch.
+ Loop *L = LI->getLoopFor(cast<Instruction>(Rem)->getParent());
+ if (L == nullptr || L->getLoopPreheader() == nullptr ||
+ L->getLoopLatch() == nullptr)
+ return false;
+
+ std::optional<bool> AddOrSub;
+ Value *AddOrSubOffset;
+ // Find the loop increment PHI: either Rem's dividend itself or an operand of it.
+ PHINode *PN = dyn_cast<PHINode>(Incr);
+ if (PN != nullptr) {
+ AddOrSub = std::nullopt;
+ AddOrSubOffset = nullptr;
+ } else {
+ // Search through a NUW add/sub (AddOrSub: true for add, false for sub).
+ Value *V0, *V1;
+ if (match(Incr, m_NUWAddLike(m_Value(V0), m_Value(V1))))
+ AddOrSub = true;
+ else if (match(Incr, m_NUWSub(m_Value(V0), m_Value(V1))))
+ AddOrSub = false;
+ else
+ return false;
+
+ PN = dyn_cast<PHINode>(V0);
+ if (PN != nullptr) {
+ AddOrSubOffset = V1;
+ } else if (*AddOrSub) {
+ PN = dyn_cast<PHINode>(V1);
+ AddOrSubOffset = V0;
+ }
+ }
+
+ if (PN == nullptr)
+ return false;
+
+ // This isn't strictly necessary, what we really need is one increment and any
+ // amount of initial values all being the same.
+ if (PN->getNumIncomingValues() != 2)
+ return false;
+
+ // Only works if the remainder amount is loop invariant.
+ if (!L->isLoopInvariant(RemAmt))
+ return false;
+
+ // Is the PHI a loop increment? NOTE(review): PN is never checked to live in L (Rem's loop) -- confirm.
+ auto LoopIncrInfo = getIVIncrement(PN, LI);
+ if (!LoopIncrInfo.has_value())
+ return false;
+
+ // We need remainder_amount % increment_amount to be zero. Increment of one
+ // satisfies that without any special logic and is overwhelmingly the common
+ // case.
+ if (!match(LoopIncrInfo->second, m_One()))
+ return false;
+
+ // Need the increment to not overflow, i.e. it must be a `nuw` add.
+ if (!match(LoopIncrInfo->first, m_NUWAdd(m_Value(), m_Value())))
+ return false;
+
+ // Set output variables (only written on success).
+ RemAmtOut = RemAmt;
+ LoopIncrPNOut = PN;
+ AddOrSubOut = AddOrSub;
+ AddOrSubOffsetOut = AddOrSubOffset;
+
+ return true;
+}
+
+// Try to transform:
+//
+// for(i = Start; i < End; ++i)
+// Rem = (i nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;
+//
+// ->
+//
+// Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
+// for(i = Start; i < End; ++i, ++rem)
+// Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
+//
+// Currently only implemented for `Start` and `IncrLoopInvariant` being zero.
+static bool foldURemOfLoopIncrement(Instruction *Rem, const LoopInfo *LI,
+ SmallSet<BasicBlock *, 32> &FreshBBs,
+ bool IsHuge) {
+ std::optional<bool> AddOrSub;
+ Value *AddOrSubOffset, *RemAmt;
+ PHINode *LoopIncrPN;
+ if (!isRemOfLoopIncrementWithLIV(Rem, LI, RemAmt, AddOrSub, AddOrSubOffset,
+ LoopIncrPN))
+ return false;
+
+ // Only non-constant remainder as the extra IV is probably not profitable
+ // in that case. Further, since remainder amount is non-constant, only handle
+ // case where `IncrLoopInvariant` and `Start` are 0 to entirely eliminate the
+ // rem (as opposed to just hoisting it outside of the loop).
+ //
+ // Potential TODO: Should we have a check for how "nested" this remainder
+ // operation is? The new code runs every iteration so if the remainder is
+ // guarded behind unlikely conditions this might not be worth it.
+ if (AddOrSub.has_value() || match(RemAmt, m_ImmConstant()))
+ return false;
+ Loop *L = LI->getLoopFor(Rem->getParent());
+ if (!match(LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader()),
+ m_Zero()))
+ return false;
+
+ // Create the new remainder as a second induction variable next to LoopIncrPN.
+ Type *Ty = Rem->getType();
+ IRBuilder<> Builder(Rem->getContext());
+
+ Builder.SetInsertPoint(LoopIncrPN);
+ PHINode *NewRem = Builder.CreatePHI(Ty, 2);
+
+ Builder.SetInsertPoint(cast<Instruction>(
+ LoopIncrPN->getIncomingValueForBlock(L->getLoopLatch())));
+ // `(add (urem x, y), 1)` is always nuw.
+ Value *RemAdd = Builder.CreateNUWAdd(NewRem, ConstantInt::get(Ty, 1));
+ Value *RemCmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, RemAdd, RemAmt);
+ Value *RemSel =
+ Builder.CreateSelect(RemCmp, Constant::getNullValue(Ty), RemAdd);
+
+ NewRem->addIncoming(Constant::getNullValue(Ty), L->getLoopPreheader());
+ NewRem->addIncoming(RemSel, L->getLoopLatch());
+
+ // Record every basic block touched by the rewrite in FreshBBs.
+ FreshBBs.insert(LoopIncrPN->getParent());
+ FreshBBs.insert(L->getLoopLatch());
+ FreshBBs.insert(Rem->getParent());
+
+ replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
+ Rem->eraseFromParent();
+ return true;
+}
+
+bool CodeGenPrepare::optimizeRem(Instruction *Rem) {
+ if (foldURemOfLoopIncrement(Rem, LI, FreshBBs, IsHugeFunc))
+ return true;
+ return false; // NOTE(review): optimizeInst also routes srem here, but the only fold matches urem.
+}
+
bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (sinkCmpExpression(Cmp, *TLI))
return true;
@@ -8360,6 +8512,11 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
if (optimizeCmp(Cmp, ModifiedDT))
return true;
+ if (match(I, m_URem(m_Value(), m_Value())) ||
+ match(I, m_SRem(m_Value(), m_Value())))
+ if (optimizeRem(I))
+ return true;
+
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
bool Modified = optimizeLoadExt(LI);
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
index 21e222bf2b1ae..9b093ec259201 100644
--- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -15,25 +15,31 @@ define void @simple_urem_to_sel(i32 %N, i32 %rem_amt) nounwind {
; CHECK-NEXT: je .LBB0_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: xorl %r15d, %r15d
; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: xorl %r12d, %r12d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl %r14d, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %ebx
-; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: movl %r14d, %edi
; CHECK-NEXT: callq use.i32 at PLT
; CHECK-NEXT: incl %r14d
-; CHECK-NEXT: cmpl %r14d, %ebp
+; CHECK-NEXT: cmpl %ebx, %r14d
+; CHECK-NEXT: cmovel %r15d, %r14d
+; CHECK-NEXT: incl %r12d
+; CHECK-NEXT: cmpl %r12d, %ebp
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .LBB0_4: # %for.cond.cleanup
; CHECK-NEXT: retq
@@ -60,24 +66,28 @@ define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind {
; CHECK-NEXT: je .LBB1_8
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: xorl %r15d, %r15d
; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: xorl %r12d, %r12d
; CHECK-NEXT: jmp .LBB1_2
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_5: # %for.body1
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: movl %r14d, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %ebx
-; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: movl %r14d, %edi
; CHECK-NEXT: callq use.i32 at PLT
; CHECK-NEXT: .LBB1_6: # %for.body.tail
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: incl %r14d
-; CHECK-NEXT: cmpl %r14d, %ebp
+; CHECK-NEXT: cmpl %ebx, %r14d
+; CHECK-NEXT: cmovel %r15d, %r14d
+; CHECK-NEXT: incl %r12d
+; CHECK-NEXT: cmpl %r12d, %ebp
; CHECK-NEXT: je .LBB1_7
; CHECK-NEXT: .LBB1_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -97,7 +107,9 @@ define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind {
; CHECK-NEXT: jmp .LBB1_6
; CHECK-NEXT: .LBB1_7:
; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .LBB1_8: # %for.cond.cleanup
; CHECK-NEXT: retq
@@ -213,40 +225,36 @@ for.body.tail:
define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
; CHECK-LABEL: simple_urem_to_sel_vec:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movq %xmm0, %rbx
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-NEXT: movq %xmm0, %r14
+; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB3_1: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movq %xmm1, %rax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divq %rbx
-; CHECK-NEXT: movq %rdx, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; CHECK-NEXT: movq %xmm1, %rax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divq %r14
-; CHECK-NEXT: movq %rdx, %xmm1
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: callq use.2xi64 at PLT
-; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psubq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
+; CHECK-NEXT: psubq %xmm1, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm2, %xmm3
+; CHECK-NEXT: pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; CHECK-NEXT: pand %xmm0, %xmm2
+; CHECK-NEXT: pandn %xmm3, %xmm2
+; CHECK-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psubq %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: callq get.i1 at PLT
; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: je .LBB3_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: addq $24, %rsp
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: popq %r14
+; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: retq
entry:
br label %for.body
@@ -336,27 +344,33 @@ define void @simple_urem_to_sel_second_acc(i32 %N, i32 %rem_amt) nounwind {
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %ebp
; CHECK-NEXT: movl $1, %r15d
+; CHECK-NEXT: xorl %r12d, %r12d
; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: xorl %r13d, %r13d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB5_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl %r14d, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %ebx
-; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: movl %r14d, %edi
; CHECK-NEXT: callq use.i32 at PLT
; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %ebx, %r14d
+; CHECK-NEXT: cmovel %r12d, %r14d
+; CHECK-NEXT: incl %r13d
; CHECK-NEXT: addl $2, %r15d
; CHECK-NEXT: cmpl %ebp, %r15d
; CHECK-NEXT: jbe .LBB5_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
@@ -676,23 +690,27 @@ define void @simple_urem_multi_latch_non_canonical(i32 %N, i32 %rem_amt) nounwin
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %ebp
; CHECK-NEXT: decl %ebp
+; CHECK-NEXT: xorl %r12d, %r12d
; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: xorl %r13d, %r13d
; CHECK-NEXT: jmp .LBB12_2
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB12_3: # %for.body.backedge
; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1
; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %ebx, %r14d
+; CHECK-NEXT: cmovel %r12d, %r14d
+; CHECK-NEXT: incl %r13d
; CHECK-NEXT: .LBB12_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl %r14d, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %ebx
-; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: movl %r14d, %edi
; CHECK-NEXT: callq use.i32 at PLT
; CHECK-NEXT: callq get.i1 at PLT
; CHECK-NEXT: movl %eax, %r15d
@@ -702,11 +720,13 @@ define void @simple_urem_multi_latch_non_canonical(i32 %N, i32 %rem_amt) nounwin
; CHECK-NEXT: # %bb.4: # %for.body0
; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1
; CHECK-NEXT: callq do_stuff1 at PLT
-; CHECK-NEXT: cmpl %r14d, %ebp
+; CHECK-NEXT: cmpl %r13d, %ebp
; CHECK-NEXT: jne .LBB12_3
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
More information about the llvm-commits
mailing list