[llvm] [SelectionDAG] Use Magic Algorithm for Splitting UDIV/UREM by Constant (PR #154968)
Marius Kamp via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 22 08:22:24 PDT 2025
https://github.com/mskamp created https://github.com/llvm/llvm-project/pull/154968
For integer types twice as large as a legal type, we previously
generated a library call if no other splitting technique was applicable.
With this change, we use an adaptation of the magic algorithm. This
algorithm is already used for UDIV/UREM by constants on legal types. The
implementation introduced here is a straightforward port of the existing
implementation to types twice the size of a legal type. The core idea of
the algorithm is to replace (udiv x, c) for a constant c with the bits
at position s and above of the product of x and (2^s + o)/c, for
suitable s and o. More details are available in Henry S. Warren, Jr.:
"Hacker's Delight", chapter 10.
Efficient handling of UDIV/UREM by constants on types twice as large as
a legal type is mostly relevant for 32-bit platforms, but some projects
also benefit on 64-bit platforms. For example, the `fmt` C++ library
uses 128-bit unsigned divisions by 100 and 10000, which were not covered
by the previously existing optimizations.
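As a minimal illustration of the identity described above (not the code
added by this patch), the sketch below divides a single 64-bit word by 3
with the magic constant 0xAAAAAAAAAAAAAAAB = ceil(2^65/3) and a
post-shift of 1. The helper name and the use of the __uint128_t
extension for the multiply-high are assumptions of the sketch; the patch
itself builds the equivalent SelectionDAG nodes on split halves.

```c++
// Sketch only: 64-bit udiv by the constant 3 via the magic identity
// floor(x / 3) == (x * M) >> 65 with M = ceil(2^65 / 3).
#include <cassert>
#include <cstdint>

static uint64_t udiv3_magic(uint64_t x) {
  const uint64_t Magic = 0xAAAAAAAAAAAAAAABULL;             // ceil(2^65 / 3)
  uint64_t Hi = (uint64_t)(((__uint128_t)x * Magic) >> 64); // multiply-high
  return Hi >> 1;                                           // post-shift
}

int main() {
  for (uint64_t x = 0; x < 1000000; ++x)
    assert(udiv3_magic(x) == x / 3);
  assert(udiv3_magic(UINT64_MAX) == UINT64_MAX / 3);
  return 0;
}
```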
Closes #137514.
From ebb5abea905a549f0b2cc14b3d796e54520e14a4 Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Tue, 12 Aug 2025 18:05:50 +0200
Subject: [PATCH 1/4] [SelectionDAG] Add Tests for Large UDIV/UREM by Constant;
NFC
---
llvm/test/CodeGen/X86/divide-by-constant.ll | 164 +++++++++++++
llvm/test/CodeGen/X86/divmod128.ll | 244 ++++++++++++++++++++
2 files changed, 408 insertions(+)
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index ac78136b9d8ea..14bcc22880697 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -1161,6 +1161,170 @@ entry:
ret i64 %rem
}
+; PR137514
+define i64 @udiv_i64_magic_large_postshift(i64 %x) nounwind {
+; X86-LABEL: udiv_i64_magic_large_postshift:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __udivdi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i64_magic_large_postshift:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: shrq $63, %rax
+; X64-NEXT: retq
+ %ret = udiv i64 %x, 13835058055282163712 ; = 3 * 2^62
+ ret i64 %ret
+}
+
+; PR137514
+define i64 @urem_i64_magic_large_postshift(i64 %x) nounwind {
+; X86-LABEL: urem_i64_magic_large_postshift:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __umoddi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: urem_i64_magic_large_postshift:
+; X64: # %bb.0:
+; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: shrq %rdx
+; X64-NEXT: movabsq $4611686018427387904, %rax # imm = 0x4000000000000000
+; X64-NEXT: andq %rdx, %rax
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: retq
+ %ret = urem i64 %x, 13835058055282163712 ; = 3 * 2^62
+ ret i64 %ret
+}
+
+; PR137514
+define i64 @udiv_i64_magic_large_preshift(i64 %x) nounwind {
+; X86-LABEL: udiv_i64_magic_large_preshift:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $14
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __udivdi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i64_magic_large_preshift:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq $33, %rax
+; X64-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: retq
+ %ret = udiv i64 %x, 60129542144 ; = 14 * 2^32
+ ret i64 %ret
+}
+
+; PR137514
+define i64 @urem_i64_magic_large_preshift(i64 %x) nounwind {
+; X86-LABEL: urem_i64_magic_large_preshift:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $14
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __umoddi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: urem_i64_magic_large_preshift:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq $33, %rax
+; X64-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movabsq $60129542144, %rax # imm = 0xE00000000
+; X64-NEXT: imulq %rdx, %rax
+; X64-NEXT: subq %rax, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %ret = urem i64 %x, 60129542144 ; = 14 * 2^32
+ ret i64 %ret
+}
+
+; PR137514
+define i64 @udiv_i64_magic_is_add(i64 %x) nounwind {
+; X86-LABEL: udiv_i64_magic_is_add:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $196608 # imm = 0x30000
+; X86-NEXT: pushl $-1
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __udivdi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i64_magic_is_add:
+; X64: # %bb.0:
+; X64-NEXT: movabsq $6148789591883185367, %rcx # imm = 0x5554E38E5ED0FCD7
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: shrq %rdi
+; X64-NEXT: leaq (%rdi,%rdx), %rax
+; X64-NEXT: shrq $49, %rax
+; X64-NEXT: retq
+ %ret = udiv i64 %x, 844429225099263 ; = 3 * 2^48 + 2^32 - 1
+ ret i64 %ret
+}
+
+; PR137514
+define i64 @urem_i64_magic_is_add(i64 %x) nounwind {
+; X86-LABEL: urem_i64_magic_is_add:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $196608 # imm = 0x30000
+; X86-NEXT: pushl $-1
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __umoddi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: urem_i64_magic_is_add:
+; X64: # %bb.0:
+; X64-NEXT: movabsq $6148789591883185367, %rcx # imm = 0x5554E38E5ED0FCD7
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq %rdx, %rax
+; X64-NEXT: shrq %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: shrq $49, %rax
+; X64-NEXT: movabsq $844429225099263, %rcx # imm = 0x30000FFFFFFFF
+; X64-NEXT: imulq %rax, %rcx
+; X64-NEXT: subq %rcx, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %ret = urem i64 %x, 844429225099263 ; = 3 * 2^48 + 2^32 - 1
+ ret i64 %ret
+}
+
; Make sure we don't inline expand for optsize.
define i64 @urem_i64_3_optsize(i64 %x) nounwind optsize {
; X86-LABEL: urem_i64_3_optsize:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 3796dd796eaf9..9d54452404fb0 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -1013,3 +1013,247 @@ entry:
%rem = urem i128 %x, 3
ret i128 %rem
}
+
+; PR137514
+define i128 @udiv_magic_preshift_and_postshift(i128 %x) nounwind {
+; X86-64-LABEL: udiv_magic_preshift_and_postshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movl $22, %edx
+; X86-64-NEXT: xorl %ecx, %ecx
+; X86-64-NEXT: callq __udivti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: udiv_magic_preshift_and_postshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $22, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __udivti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = udiv i128 %x, 22
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @urem_magic_preshift_and_postshift(i128 %x) nounwind {
+; X86-64-LABEL: urem_magic_preshift_and_postshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movl $22, %edx
+; X86-64-NEXT: xorl %ecx, %ecx
+; X86-64-NEXT: callq __umodti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: urem_magic_preshift_and_postshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $22, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __umodti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = urem i128 %x, 22
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @udiv_magic_large_preshift(i128 %x) nounwind {
+; X86-64-LABEL: udiv_magic_large_preshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000
+; X86-64-NEXT: xorl %edx, %edx
+; X86-64-NEXT: callq __udivti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: udiv_magic_large_preshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000
+; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __udivti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = udiv i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @urem_magic_large_preshift(i128 %x) nounwind {
+; X86-64-LABEL: urem_magic_large_preshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000
+; X86-64-NEXT: xorl %edx, %edx
+; X86-64-NEXT: callq __umodti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: urem_magic_large_preshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000
+; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __umodti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = urem i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @udiv_magic_large_postshift(i128 %x) nounwind {
+; X86-64-LABEL: udiv_magic_large_postshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movl $1, %edx
+; X86-64-NEXT: movl $1, %ecx
+; X86-64-NEXT: callq __udivti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: udiv_magic_large_postshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __udivti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = udiv i128 %x, 18446744073709551617 ; = 2^64 + 1
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @urem_magic_large_postshift(i128 %x) nounwind {
+; X86-64-LABEL: urem_magic_large_postshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movl $1, %edx
+; X86-64-NEXT: movl $1, %ecx
+; X86-64-NEXT: callq __umodti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: urem_magic_large_postshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __umodti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = urem i128 %x, 18446744073709551617 ; = 2^64 + 1
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @udiv_magic_is_add(i128 %x) nounwind {
+; X86-64-LABEL: udiv_magic_is_add:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X86-64-NEXT: movl $1, %edx
+; X86-64-NEXT: callq __udivti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: udiv_magic_is_add:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __udivti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = udiv i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @urem_magic_is_add(i128 %x) nounwind {
+; X86-64-LABEL: urem_magic_is_add:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X86-64-NEXT: movl $1, %edx
+; X86-64-NEXT: callq __umodti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: urem_magic_is_add:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __umodti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = urem i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1
+ ret i128 %ret
+}
From 5c02eca1543e8905bd1ee154559ee2404c2f525c Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Tue, 19 Aug 2025 05:39:58 +0200
Subject: [PATCH 2/4] [SelectionDAG] Adjust Existing Tests; NFC
Add new check prefixes to some tests. Currently, these prefixes are
unused, but a subsequent commit will change the test results such that
they become necessary.
Furthermore, rename tests whose urem will be folded by a subsequent
commit.
---
llvm/test/CodeGen/PowerPC/urem-lkk.ll | 12 +++++++-----
llvm/test/CodeGen/RISCV/urem-lkk.ll | 11 +++++------
llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 11 +++++------
3 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/llvm/test/CodeGen/PowerPC/urem-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-lkk.ll
index 43a1e5a2faf6d..03fd0c0c7e8e2 100644
--- a/llvm/test/CodeGen/PowerPC/urem-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-lkk.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc64 < %s | FileCheck --check-prefix=CHECK %s
-; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc64 < %s | FileCheck --check-prefixes=CHECK,PPC64 %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc < %s | FileCheck --check-prefixes=CHECK,PPC32 %s
define i32 @fold_urem_positive_odd(i32 %x) {
; CHECK-LABEL: fold_urem_positive_odd:
@@ -85,9 +85,8 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) {
ret i32 %1
}
-; Don't fold i64 urem
-define i64 @dont_fold_urem_i64(i64 %x) {
-; CHECK-LABEL: dont_fold_urem_i64:
+define i64 @fold_urem_i64(i64 %x) {
+; CHECK-LABEL: fold_urem_i64:
; CHECK: # %bb.0:
; CHECK-NEXT: mflr 0
; CHECK-NEXT: stwu 1, -16(1)
@@ -104,3 +103,6 @@ define i64 @dont_fold_urem_i64(i64 %x) {
%1 = urem i64 %x, 98
ret i64 %1
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; PPC32: {{.*}}
+; PPC64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll
index ee496123ba7b4..017b2d36bdd58 100644
--- a/llvm/test/CodeGen/RISCV/urem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll
@@ -218,9 +218,8 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) nounwind {
ret i32 %1
}
-; Don't fold i64 urem
-define i64 @dont_fold_urem_i64(i64 %x) nounwind {
-; RV32I-LABEL: dont_fold_urem_i64:
+define i64 @fold_urem_i64(i64 %x) nounwind {
+; RV32I-LABEL: fold_urem_i64:
; RV32I: # %bb.0:
; RV32I-NEXT: addi sp, sp, -16
; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
@@ -231,7 +230,7 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind {
; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
-; RV32IM-LABEL: dont_fold_urem_i64:
+; RV32IM-LABEL: fold_urem_i64:
; RV32IM: # %bb.0:
; RV32IM-NEXT: addi sp, sp, -16
; RV32IM-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
@@ -242,12 +241,12 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind {
; RV32IM-NEXT: addi sp, sp, 16
; RV32IM-NEXT: ret
;
-; RV64I-LABEL: dont_fold_urem_i64:
+; RV64I-LABEL: fold_urem_i64:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, 98
; RV64I-NEXT: tail __umoddi3
;
-; RV64IM-LABEL: dont_fold_urem_i64:
+; RV64IM-LABEL: fold_urem_i64:
; RV64IM: # %bb.0:
; RV64IM-NEXT: lui a1, %hi(.LCPI6_0)
; RV64IM-NEXT: ld a1, %lo(.LCPI6_0)(a1)
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 5a3dfd118307d..ec97e7a0ae558 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -778,9 +778,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
ret <4 x i16> %1
}
-; Don't fold i64 urem.
-define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
-; RV32I-LABEL: dont_fold_urem_i64:
+define <4 x i64> @fold_urem_i64(<4 x i64> %x) nounwind {
+; RV32I-LABEL: fold_urem_i64:
; RV32I: # %bb.0:
; RV32I-NEXT: addi sp, sp, -48
; RV32I-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
@@ -850,7 +849,7 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32I-NEXT: addi sp, sp, 48
; RV32I-NEXT: ret
;
-; RV32IM-LABEL: dont_fold_urem_i64:
+; RV32IM-LABEL: fold_urem_i64:
; RV32IM: # %bb.0:
; RV32IM-NEXT: addi sp, sp, -48
; RV32IM-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
@@ -920,7 +919,7 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: addi sp, sp, 48
; RV32IM-NEXT: ret
;
-; RV64I-LABEL: dont_fold_urem_i64:
+; RV64I-LABEL: fold_urem_i64:
; RV64I: # %bb.0:
; RV64I-NEXT: addi sp, sp, -48
; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill
@@ -956,7 +955,7 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV64I-NEXT: addi sp, sp, 48
; RV64I-NEXT: ret
;
-; RV64IM-LABEL: dont_fold_urem_i64:
+; RV64IM-LABEL: fold_urem_i64:
; RV64IM: # %bb.0:
; RV64IM-NEXT: ld a2, 8(a1)
; RV64IM-NEXT: ld a3, 16(a1)
From 04d4d45d7ae9afbc3ab3087812c0772deef6cbfe Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Tue, 15 Jul 2025 16:21:25 +0200
Subject: [PATCH 3/4] [SelectionDAG] Move UREM Decomposition to Own Function;
NFC
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 79 +++++++++++--------
1 file changed, 46 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 402a012e8e555..5862cf6c7112e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8011,25 +8011,12 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
// dividend and multiply by the multiplicative inverse of the shifted divisor.
// If we want the remainder, we shift the value left by the number of trailing
// zeros and add the bits that were shifted out of the dividend.
-bool TargetLowering::expandDIVREMByConstant(SDNode *N,
- SmallVectorImpl<SDValue> &Result,
- EVT HiLoVT, SelectionDAG &DAG,
- SDValue LL, SDValue LH) const {
+static bool expandUDIVREMByConstantViaUREMDecomposition(
+ SDNode *N, APInt Divisor, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
+ SelectionDAG &DAG, SDValue LL, SDValue LH, const TargetLowering &TLI) {
unsigned Opcode = N->getOpcode();
EVT VT = N->getValueType(0);
- // TODO: Support signed division/remainder.
- if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
- return false;
- assert(
- (Opcode == ISD::UREM || Opcode == ISD::UDIV || Opcode == ISD::UDIVREM) &&
- "Unexpected opcode");
-
- auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!CN)
- return false;
-
- APInt Divisor = CN->getAPIntValue();
unsigned BitWidth = Divisor.getBitWidth();
unsigned HBitWidth = BitWidth / 2;
assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -8040,20 +8027,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
if (Divisor.uge(HalfMaxPlus1))
return false;
- // We depend on the UREM by constant optimization in DAGCombiner that requires
- // high multiply.
- if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
- !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
- return false;
-
- // Don't expand if optimizing for size.
- if (DAG.shouldOptForSize())
- return false;
-
- // Early out for 0 or 1 divisors.
- if (Divisor.ule(1))
- return false;
-
// If the divisor is even, shift it until it becomes odd.
unsigned TrailingZeros = 0;
if (!Divisor[0]) {
@@ -8097,8 +8070,8 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// Use uaddo_carry if we can, otherwise use a compare to detect overflow.
EVT SetCCType =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
- if (isOperationLegalOrCustom(ISD::UADDO_CARRY, HiLoVT)) {
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
+ if (TLI.isOperationLegalOrCustom(ISD::UADDO_CARRY, HiLoVT)) {
SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType);
Sum = DAG.getNode(ISD::UADDO, dl, VTList, LL, LH);
Sum = DAG.getNode(ISD::UADDO_CARRY, dl, VTList, Sum,
@@ -8108,7 +8081,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, LL, ISD::SETULT);
// If the boolean for the target is 0 or 1, we can add the setcc result
// directly.
- if (getBooleanContents(HiLoVT) ==
+ if (TLI.getBooleanContents(HiLoVT) ==
TargetLoweringBase::ZeroOrOneBooleanContent)
Carry = DAG.getZExtOrTrunc(Carry, dl, HiLoVT);
else
@@ -8164,6 +8137,46 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
return true;
}
+bool TargetLowering::expandDIVREMByConstant(SDNode *N,
+ SmallVectorImpl<SDValue> &Result,
+ EVT HiLoVT, SelectionDAG &DAG,
+ SDValue LL, SDValue LH) const {
+ unsigned Opcode = N->getOpcode();
+
+ // TODO: Support signed division/remainder.
+ if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
+ return false;
+ assert(
+ (Opcode == ISD::UREM || Opcode == ISD::UDIV || Opcode == ISD::UDIVREM) &&
+ "Unexpected opcode");
+
+ auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!CN)
+ return false;
+
+ APInt Divisor = CN->getAPIntValue();
+
+ // We depend on the UREM by constant optimization in DAGCombiner that requires
+ // high multiply.
+ if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
+ !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
+ return false;
+
+ // Don't expand if optimizing for size.
+ if (DAG.shouldOptForSize())
+ return false;
+
+ // Early out for 0 or 1 divisors.
+ if (Divisor.ule(1))
+ return false;
+
+ if (expandUDIVREMByConstantViaUREMDecomposition(N, Divisor, Result, HiLoVT,
+ DAG, LL, LH, *this))
+ return true;
+
+ return false;
+}
+
// Check that (every element of) Z is undef or not an exact multiple of BW.
static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) {
return ISD::matchUnaryPredicate(
From 5bdd489a7640cb10e8339f3c3fee7c6078172577 Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Tue, 12 Aug 2025 16:49:49 +0200
Subject: [PATCH 4/4] [SelectionDAG] Use Magic Algorithm for Splitting
UDIV/UREM by Constant
For integer types twice as large as a legal type, we previously
generated a library call if no other splitting technique was applicable.
With this change, we use an adaptation of the magic algorithm. This
algorithm is already used for UDIV/UREM by constants on legal types. The
implementation introduced here is a straightforward port of the existing
implementation to types twice the size of a legal type. The core idea of
the algorithm is to replace (udiv x, c) for a constant c with the bits
at position s and above of the product of x and (2^s + o)/c, for
suitable s and o. More details are available in Henry S. Warren, Jr.:
"Hacker's Delight", chapter 10.
Efficient handling of UDIV/UREM by constants on types twice as large as
a legal type is mostly relevant for 32-bit platforms, but some projects
also benefit on 64-bit platforms. For example, the `fmt` C++ library
uses 128-bit unsigned divisions by 100 and 10000, which were not covered
by the previously existing optimizations.
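As a hedged illustration of the fixup path (the IsAdd case handled by
this patch), the sketch below divides a 64-bit word by 7, where the
effective magic constant 2^64 + 0x2492492492492493 does not fit into a
single word. The helper name and the use of __uint128_t are assumptions
of the sketch, not code taken from this patch.

```c++
// Sketch only: 64-bit udiv by 7 needs the "IsAdd" fixup because the
// magic constant 2^64 + M' exceeds the word size, where
// M' = ceil(2^67 / 7) - 2^64 = 0x2492492492492493.
#include <cassert>
#include <cstdint>

static uint64_t udiv7_magic(uint64_t x) {
  const uint64_t MagicLow = 0x2492492492492493ULL;
  uint64_t Q = (uint64_t)(((__uint128_t)x * MagicLow) >> 64); // multiply-high
  uint64_t NPQ = (x - Q) >> 1; // fold in the implicit 2^64 term, halved
  return (NPQ + Q) >> 2;       // remaining post-shift
}

int main() {
  for (uint64_t x = 0; x < 1000000; ++x)
    assert(udiv7_magic(x) == x / 7);
  assert(udiv7_magic(UINT64_MAX) == UINT64_MAX / 7);
  return 0;
}
```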
Closes #137514.
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 113 +++
llvm/test/CodeGen/AArch64/rem-by-const.ll | 280 ++++++--
llvm/test/CodeGen/ARM/funnel-shift.ll | 240 ++++---
llvm/test/CodeGen/Mips/funnel-shift.ll | 373 +++++-----
llvm/test/CodeGen/PowerPC/funnel-shift.ll | 312 ++++----
llvm/test/CodeGen/PowerPC/urem-lkk.ll | 96 ++-
llvm/test/CodeGen/RISCV/div-by-constant.ll | 49 +-
.../CodeGen/RISCV/split-udiv-by-constant.ll | 172 ++++-
.../CodeGen/RISCV/split-urem-by-constant.ll | 220 +++++-
llvm/test/CodeGen/RISCV/urem-lkk.ll | 51 +-
llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 193 +++--
llvm/test/CodeGen/X86/divide-by-constant.ll | 410 +++++++++--
llvm/test/CodeGen/X86/divmod128.ll | 670 ++++++++++++++----
llvm/test/CodeGen/X86/funnel-shift.ll | 112 ++-
llvm/test/CodeGen/X86/i128-udiv.ll | 46 +-
15 files changed, 2393 insertions(+), 944 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5862cf6c7112e..9e1a51e291ddb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8137,6 +8137,115 @@ static bool expandUDIVREMByConstantViaUREMDecomposition(
return true;
}
+static bool
+expandUDIVREMByConstantViaUMulHiMagic(SDNode *N, const APInt &Divisor,
+ SmallVectorImpl<SDValue> &Result,
+ EVT HiLoVT, SelectionDAG &DAG, SDValue LL,
+ SDValue LH, const TargetLowering &TLI) {
+
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N0->getValueType(0);
+ SDLoc DL{N};
+
+ assert(!Divisor.isOne() && "Magic algorithm does not work for division by 1");
+
+ // This helper creates a MUL_LOHI of the pair (LL, LH) by a constant.
+ auto MakeMUL_LOHIByConst = [&](unsigned Opc, SDValue LL, SDValue LH,
+ const APInt &Const,
+ SmallVectorImpl<SDValue> &Result) {
+ SDValue LHS = DAG.getNode(ISD::BUILD_PAIR, DL, VT, LL, LH);
+ SDValue RHS = DAG.getConstant(Const, DL, VT);
+ auto [RL, RH] = DAG.SplitScalar(RHS, DL, HiLoVT, HiLoVT);
+ return TLI.expandMUL_LOHI(
+ Opc, VT, DL, LHS, RHS, Result, HiLoVT, DAG,
+ TargetLowering::MulExpansionKind::OnlyLegalOrCustom, LL, LH, RL, RH);
+ };
+
+ // This helper creates an ADD/SUB of the pairs (LL, LH) and (RL, RH).
+ auto MakeAddSubLong = [&](unsigned Opc, SDValue LL, SDValue LH, SDValue RL,
+ SDValue RH) {
+ SDValue AddSubNode =
+ DAG.getNode(Opc == ISD::ADD ? ISD::UADDO : ISD::USUBO, DL,
+ DAG.getVTList(HiLoVT, MVT::i1), LL, RL);
+ SDValue OutL, OutH, Overflow;
+ TLI.expandUADDSUBO(AddSubNode.getNode(), OutL, Overflow, DAG);
+ SDValue WithOverflow = DAG.getNode(
+ Opc, DL, HiLoVT, LH, DAG.getZExtOrTrunc(Overflow, DL, HiLoVT));
+ OutH = DAG.getNode(Opc, DL, HiLoVT, WithOverflow, RH);
+ return std::make_pair(OutL, OutH);
+ };
+
+ // This helper creates a SRL of the pair (LL, LH) by Shift.
+ auto MakeSRLLong = [&](SDValue LL, SDValue LH, unsigned Shift) {
+ unsigned HBitWidth = HiLoVT.getScalarSizeInBits();
+ if (Shift < HBitWidth) {
+ SDValue ShAmt = DAG.getConstant(Shift, DL, HiLoVT);
+ SDValue ResL = DAG.getNode(ISD::FSHR, DL, HiLoVT, LH, LL, ShAmt);
+ SDValue ResH = DAG.getNode(ISD::SRL, DL, HiLoVT, LH, ShAmt);
+ return std::make_pair(ResL, ResH);
+ }
+ SDValue Zero = DAG.getConstant(0, DL, HiLoVT);
+ if (Shift == HBitWidth)
+ return std::make_pair(LH, Zero);
+ assert(Shift - HBitWidth < HBitWidth &&
+ "We shouldn't generate an undefined shift");
+ SDValue ShAmt = DAG.getConstant(Shift - HBitWidth, DL, HiLoVT);
+ return std::make_pair(DAG.getNode(ISD::SRL, DL, HiLoVT, LH, ShAmt), Zero);
+ };
+
+ // Knowledge of leading zeros may help to reduce the multiplier.
+ unsigned KnownLeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
+
+ UnsignedDivisionByConstantInfo Magics = UnsignedDivisionByConstantInfo::get(
+ Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()));
+
+ assert(!LL == !LH && "Expected both input halves or no input halves!");
+ if (!LL)
+ std::tie(LL, LH) = DAG.SplitScalar(N0, DL, HiLoVT, HiLoVT);
+ SDValue QL = LL;
+ SDValue QH = LH;
+ if (Magics.PreShift != 0)
+ std::tie(QL, QH) = MakeSRLLong(QL, QH, Magics.PreShift);
+
+ SmallVector<SDValue, 2> UMulResult;
+ if (!MakeMUL_LOHIByConst(ISD::UMUL_LOHI, QL, QH, Magics.Magic, UMulResult))
+ return false;
+
+ QL = UMulResult[2];
+ QH = UMulResult[3];
+
+ if (Magics.IsAdd) {
+ auto [NPQL, NPQH] = MakeAddSubLong(ISD::SUB, LL, LH, QL, QH);
+ std::tie(NPQL, NPQH) = MakeSRLLong(NPQL, NPQH, 1);
+ std::tie(QL, QH) = MakeAddSubLong(ISD::ADD, NPQL, NPQH, QL, QH);
+ }
+
+ if (Magics.PostShift != 0)
+ std::tie(QL, QH) = MakeSRLLong(QL, QH, Magics.PostShift);
+
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != ISD::UREM) {
+ Result.push_back(QL);
+ Result.push_back(QH);
+ }
+
+ if (Opcode != ISD::UDIV) {
+ SmallVector<SDValue, 2> MulResult;
+ if (!MakeMUL_LOHIByConst(ISD::MUL, QL, QH, Divisor, MulResult))
+ return false;
+
+ assert(MulResult.size() == 2);
+
+ auto [RemL, RemH] =
+ MakeAddSubLong(ISD::SUB, LL, LH, MulResult[0], MulResult[1]);
+
+ Result.push_back(RemL);
+ Result.push_back(RemH);
+ }
+
+ return true;
+}
+
bool TargetLowering::expandDIVREMByConstant(SDNode *N,
SmallVectorImpl<SDValue> &Result,
EVT HiLoVT, SelectionDAG &DAG,
@@ -8174,6 +8283,10 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
DAG, LL, LH, *this))
return true;
+ if (expandUDIVREMByConstantViaUMulHiMagic(N, Divisor, Result, HiLoVT, DAG, LL,
+ LH, *this))
+ return true;
+
return false;
}
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index c57383ad9b1e7..0554b2e66a0be 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -513,13 +513,50 @@ entry:
define i128 @ui128_7(i128 %a, i128 %b) {
; CHECK-SD-LABEL: ui128_7:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: .cfi_offset w30, -16
-; CHECK-SD-NEXT: mov w2, #7 // =0x7
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x8, #9362 // =0x2492
+; CHECK-SD-NEXT: mov x11, #18725 // =0x4925
+; CHECK-SD-NEXT: movk x8, #37449, lsl #16
+; CHECK-SD-NEXT: movk x11, #9362, lsl #16
+; CHECK-SD-NEXT: movk x8, #18724, lsl #32
+; CHECK-SD-NEXT: movk x11, #37449, lsl #32
+; CHECK-SD-NEXT: movk x8, #9362, lsl #48
+; CHECK-SD-NEXT: movk x11, #18724, lsl #48
+; CHECK-SD-NEXT: mul x10, x0, x8
+; CHECK-SD-NEXT: umulh x12, x0, x11
+; CHECK-SD-NEXT: umulh x9, x0, x8
+; CHECK-SD-NEXT: umulh x14, x1, x11
+; CHECK-SD-NEXT: adds x10, x12, x10
+; CHECK-SD-NEXT: mul x11, x1, x11
+; CHECK-SD-NEXT: cinc x9, x9, hs
+; CHECK-SD-NEXT: umulh x13, x1, x8
+; CHECK-SD-NEXT: mul x8, x1, x8
+; CHECK-SD-NEXT: cmn x10, x11
+; CHECK-SD-NEXT: adcs x9, x9, x14
+; CHECK-SD-NEXT: cinc x10, x13, hs
+; CHECK-SD-NEXT: adds x11, x9, x8
+; CHECK-SD-NEXT: cinc x12, x10, hs
+; CHECK-SD-NEXT: subs x13, x0, x11
+; CHECK-SD-NEXT: cset w14, lo
+; CHECK-SD-NEXT: sub x14, x1, x14
+; CHECK-SD-NEXT: sub x12, x14, x12
+; CHECK-SD-NEXT: extr x13, x12, x13, #1
+; CHECK-SD-NEXT: lsr x12, x12, #1
+; CHECK-SD-NEXT: adds x11, x13, x11
+; CHECK-SD-NEXT: cinc x12, x12, hs
+; CHECK-SD-NEXT: cmn x9, x8
+; CHECK-SD-NEXT: adc x8, x12, x10
+; CHECK-SD-NEXT: mov w10, #7 // =0x7
+; CHECK-SD-NEXT: extr x9, x8, x11, #2
+; CHECK-SD-NEXT: lsr x8, x8, #2
+; CHECK-SD-NEXT: umulh x10, x9, x10
+; CHECK-SD-NEXT: lsl x11, x9, #3
+; CHECK-SD-NEXT: sub x9, x11, x9
+; CHECK-SD-NEXT: subs x0, x0, x9
+; CHECK-SD-NEXT: cset w9, lo
+; CHECK-SD-NEXT: sub x10, x10, x8
+; CHECK-SD-NEXT: sub x9, x1, x9
+; CHECK-SD-NEXT: add x8, x10, x8, lsl #3
+; CHECK-SD-NEXT: sub x1, x9, x8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ui128_7:
@@ -596,13 +633,38 @@ entry:
define i128 @ui128_100(i128 %a, i128 %b) {
; CHECK-SD-LABEL: ui128_100:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: .cfi_offset w30, -16
-; CHECK-SD-NEXT: mov w2, #100 // =0x64
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x8, #62914 // =0xf5c2
+; CHECK-SD-NEXT: mov x11, #23593 // =0x5c29
+; CHECK-SD-NEXT: movk x8, #23592, lsl #16
+; CHECK-SD-NEXT: movk x11, #49807, lsl #16
+; CHECK-SD-NEXT: movk x8, #49807, lsl #32
+; CHECK-SD-NEXT: movk x11, #10485, lsl #32
+; CHECK-SD-NEXT: movk x8, #10485, lsl #48
+; CHECK-SD-NEXT: movk x11, #36700, lsl #48
+; CHECK-SD-NEXT: mul x10, x0, x8
+; CHECK-SD-NEXT: umulh x12, x0, x11
+; CHECK-SD-NEXT: umulh x9, x0, x8
+; CHECK-SD-NEXT: umulh x14, x1, x11
+; CHECK-SD-NEXT: adds x10, x12, x10
+; CHECK-SD-NEXT: mul x11, x1, x11
+; CHECK-SD-NEXT: cinc x9, x9, hs
+; CHECK-SD-NEXT: umulh x13, x1, x8
+; CHECK-SD-NEXT: mul x8, x1, x8
+; CHECK-SD-NEXT: cmn x10, x11
+; CHECK-SD-NEXT: adcs x9, x9, x14
+; CHECK-SD-NEXT: cinc x10, x13, hs
+; CHECK-SD-NEXT: adds x8, x9, x8
+; CHECK-SD-NEXT: cinc x9, x10, hs
+; CHECK-SD-NEXT: mov w10, #100 // =0x64
+; CHECK-SD-NEXT: extr x8, x9, x8, #4
+; CHECK-SD-NEXT: lsr x9, x9, #4
+; CHECK-SD-NEXT: umulh x11, x8, x10
+; CHECK-SD-NEXT: mul x8, x8, x10
+; CHECK-SD-NEXT: madd x9, x9, x10, x11
+; CHECK-SD-NEXT: subs x0, x0, x8
+; CHECK-SD-NEXT: cset w8, lo
+; CHECK-SD-NEXT: sub x8, x1, x8
+; CHECK-SD-NEXT: sub x1, x8, x9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ui128_100:
@@ -3204,34 +3266,85 @@ entry:
define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-LABEL: uv2i128_7:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 48
-; CHECK-SD-NEXT: .cfi_offset w19, -8
-; CHECK-SD-NEXT: .cfi_offset w20, -16
-; CHECK-SD-NEXT: .cfi_offset w21, -24
-; CHECK-SD-NEXT: .cfi_offset w22, -32
-; CHECK-SD-NEXT: .cfi_offset w30, -48
-; CHECK-SD-NEXT: mov x19, x3
-; CHECK-SD-NEXT: mov x20, x2
-; CHECK-SD-NEXT: mov w2, #7 // =0x7
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: mov x21, x0
-; CHECK-SD-NEXT: mov x22, x1
-; CHECK-SD-NEXT: mov x0, x20
-; CHECK-SD-NEXT: mov x1, x19
-; CHECK-SD-NEXT: mov w2, #7 // =0x7
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: mov x2, x0
-; CHECK-SD-NEXT: mov x3, x1
-; CHECK-SD-NEXT: mov x0, x21
-; CHECK-SD-NEXT: mov x1, x22
-; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x8, #9362 // =0x2492
+; CHECK-SD-NEXT: mov x11, #18725 // =0x4925
+; CHECK-SD-NEXT: movk x8, #37449, lsl #16
+; CHECK-SD-NEXT: movk x11, #9362, lsl #16
+; CHECK-SD-NEXT: movk x8, #18724, lsl #32
+; CHECK-SD-NEXT: movk x11, #37449, lsl #32
+; CHECK-SD-NEXT: movk x8, #9362, lsl #48
+; CHECK-SD-NEXT: movk x11, #18724, lsl #48
+; CHECK-SD-NEXT: mul x10, x0, x8
+; CHECK-SD-NEXT: umulh x12, x0, x11
+; CHECK-SD-NEXT: umulh x9, x0, x8
+; CHECK-SD-NEXT: mul x15, x1, x11
+; CHECK-SD-NEXT: adds x10, x12, x10
+; CHECK-SD-NEXT: umulh x14, x1, x11
+; CHECK-SD-NEXT: cinc x9, x9, hs
+; CHECK-SD-NEXT: umulh x13, x1, x8
+; CHECK-SD-NEXT: cmn x10, x15
+; CHECK-SD-NEXT: mul x16, x1, x8
+; CHECK-SD-NEXT: adcs x9, x9, x14
+; CHECK-SD-NEXT: mul x12, x2, x8
+; CHECK-SD-NEXT: cinc x13, x13, hs
+; CHECK-SD-NEXT: umulh x10, x2, x11
+; CHECK-SD-NEXT: adds x14, x9, x16
+; CHECK-SD-NEXT: cinc x15, x13, hs
+; CHECK-SD-NEXT: subs x18, x0, x14
+; CHECK-SD-NEXT: umulh x17, x2, x8
+; CHECK-SD-NEXT: cset w5, lo
+; CHECK-SD-NEXT: sub x5, x1, x5
+; CHECK-SD-NEXT: umulh x6, x3, x11
+; CHECK-SD-NEXT: sub x15, x5, x15
+; CHECK-SD-NEXT: extr x18, x15, x18, #1
+; CHECK-SD-NEXT: mul x11, x3, x11
+; CHECK-SD-NEXT: lsr x15, x15, #1
+; CHECK-SD-NEXT: umulh x4, x3, x8
+; CHECK-SD-NEXT: adds x14, x18, x14
+; CHECK-SD-NEXT: cinc x15, x15, hs
+; CHECK-SD-NEXT: cmn x9, x16
+; CHECK-SD-NEXT: mul x8, x3, x8
+; CHECK-SD-NEXT: adc x9, x15, x13
+; CHECK-SD-NEXT: adds x10, x10, x12
+; CHECK-SD-NEXT: cinc x12, x17, hs
+; CHECK-SD-NEXT: cmn x10, x11
+; CHECK-SD-NEXT: adcs x10, x12, x6
+; CHECK-SD-NEXT: cinc x11, x4, hs
+; CHECK-SD-NEXT: adds x12, x10, x8
+; CHECK-SD-NEXT: cinc x13, x11, hs
+; CHECK-SD-NEXT: subs x15, x2, x12
+; CHECK-SD-NEXT: cset w16, lo
+; CHECK-SD-NEXT: sub x16, x3, x16
+; CHECK-SD-NEXT: sub x13, x16, x13
+; CHECK-SD-NEXT: extr x15, x13, x15, #1
+; CHECK-SD-NEXT: lsr x13, x13, #1
+; CHECK-SD-NEXT: adds x12, x15, x12
+; CHECK-SD-NEXT: cinc x13, x13, hs
+; CHECK-SD-NEXT: cmn x10, x8
+; CHECK-SD-NEXT: extr x8, x9, x14, #2
+; CHECK-SD-NEXT: adc x10, x13, x11
+; CHECK-SD-NEXT: mov w11, #7 // =0x7
+; CHECK-SD-NEXT: lsr x9, x9, #2
+; CHECK-SD-NEXT: extr x12, x10, x12, #2
+; CHECK-SD-NEXT: umulh x13, x8, x11
+; CHECK-SD-NEXT: lsl x14, x8, #3
+; CHECK-SD-NEXT: lsr x10, x10, #2
+; CHECK-SD-NEXT: umulh x11, x12, x11
+; CHECK-SD-NEXT: lsl x15, x12, #3
+; CHECK-SD-NEXT: sub x8, x14, x8
+; CHECK-SD-NEXT: subs x0, x0, x8
+; CHECK-SD-NEXT: sub x8, x15, x12
+; CHECK-SD-NEXT: cset w12, lo
+; CHECK-SD-NEXT: sub x13, x13, x9
+; CHECK-SD-NEXT: subs x2, x2, x8
+; CHECK-SD-NEXT: add x8, x13, x9, lsl #3
+; CHECK-SD-NEXT: sub x11, x11, x10
+; CHECK-SD-NEXT: add x9, x11, x10, lsl #3
+; CHECK-SD-NEXT: cset w10, lo
+; CHECK-SD-NEXT: sub x11, x1, x12
+; CHECK-SD-NEXT: sub x10, x3, x10
+; CHECK-SD-NEXT: sub x1, x11, x8
+; CHECK-SD-NEXT: sub x3, x10, x9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: uv2i128_7:
@@ -3361,34 +3474,61 @@ entry:
define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-LABEL: uv2i128_100:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 48
-; CHECK-SD-NEXT: .cfi_offset w19, -8
-; CHECK-SD-NEXT: .cfi_offset w20, -16
-; CHECK-SD-NEXT: .cfi_offset w21, -24
-; CHECK-SD-NEXT: .cfi_offset w22, -32
-; CHECK-SD-NEXT: .cfi_offset w30, -48
-; CHECK-SD-NEXT: mov x19, x3
-; CHECK-SD-NEXT: mov x20, x2
-; CHECK-SD-NEXT: mov w2, #100 // =0x64
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: mov x21, x0
-; CHECK-SD-NEXT: mov x22, x1
-; CHECK-SD-NEXT: mov x0, x20
-; CHECK-SD-NEXT: mov x1, x19
-; CHECK-SD-NEXT: mov w2, #100 // =0x64
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: mov x2, x0
-; CHECK-SD-NEXT: mov x3, x1
-; CHECK-SD-NEXT: mov x0, x21
-; CHECK-SD-NEXT: mov x1, x22
-; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x8, #62914 // =0xf5c2
+; CHECK-SD-NEXT: mov x11, #23593 // =0x5c29
+; CHECK-SD-NEXT: movk x8, #23592, lsl #16
+; CHECK-SD-NEXT: movk x11, #49807, lsl #16
+; CHECK-SD-NEXT: movk x8, #49807, lsl #32
+; CHECK-SD-NEXT: movk x11, #10485, lsl #32
+; CHECK-SD-NEXT: movk x8, #10485, lsl #48
+; CHECK-SD-NEXT: movk x11, #36700, lsl #48
+; CHECK-SD-NEXT: mul x10, x0, x8
+; CHECK-SD-NEXT: umulh x12, x0, x11
+; CHECK-SD-NEXT: umulh x9, x0, x8
+; CHECK-SD-NEXT: mul x15, x1, x11
+; CHECK-SD-NEXT: adds x10, x12, x10
+; CHECK-SD-NEXT: mov w12, #100 // =0x64
+; CHECK-SD-NEXT: umulh x14, x1, x11
+; CHECK-SD-NEXT: cinc x9, x9, hs
+; CHECK-SD-NEXT: umulh x13, x1, x8
+; CHECK-SD-NEXT: cmn x10, x15
+; CHECK-SD-NEXT: mul x16, x1, x8
+; CHECK-SD-NEXT: adcs x9, x9, x14
+; CHECK-SD-NEXT: mul x15, x2, x8
+; CHECK-SD-NEXT: cinc x10, x13, hs
+; CHECK-SD-NEXT: umulh x14, x2, x8
+; CHECK-SD-NEXT: adds x9, x9, x16
+; CHECK-SD-NEXT: cinc x10, x10, hs
+; CHECK-SD-NEXT: umulh x16, x2, x11
+; CHECK-SD-NEXT: extr x9, x10, x9, #4
+; CHECK-SD-NEXT: lsr x10, x10, #4
+; CHECK-SD-NEXT: umulh x18, x3, x11
+; CHECK-SD-NEXT: mul x13, x9, x12
+; CHECK-SD-NEXT: mul x11, x3, x11
+; CHECK-SD-NEXT: umulh x17, x3, x8
+; CHECK-SD-NEXT: subs x0, x0, x13
+; CHECK-SD-NEXT: mul x8, x3, x8
+; CHECK-SD-NEXT: cset w13, lo
+; CHECK-SD-NEXT: adds x15, x16, x15
+; CHECK-SD-NEXT: cinc x14, x14, hs
+; CHECK-SD-NEXT: cmn x15, x11
+; CHECK-SD-NEXT: adcs x11, x14, x18
+; CHECK-SD-NEXT: umulh x9, x9, x12
+; CHECK-SD-NEXT: cinc x14, x17, hs
+; CHECK-SD-NEXT: adds x8, x11, x8
+; CHECK-SD-NEXT: madd x9, x10, x12, x9
+; CHECK-SD-NEXT: cinc x11, x14, hs
+; CHECK-SD-NEXT: extr x8, x11, x8, #4
+; CHECK-SD-NEXT: lsr x11, x11, #4
+; CHECK-SD-NEXT: umulh x14, x8, x12
+; CHECK-SD-NEXT: mul x8, x8, x12
+; CHECK-SD-NEXT: madd x10, x11, x12, x14
+; CHECK-SD-NEXT: sub x11, x1, x13
+; CHECK-SD-NEXT: sub x1, x11, x9
+; CHECK-SD-NEXT: subs x2, x2, x8
+; CHECK-SD-NEXT: cset w8, lo
+; CHECK-SD-NEXT: sub x8, x3, x8
+; CHECK-SD-NEXT: sub x3, x8, x10
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: uv2i128_100:
diff --git a/llvm/test/CodeGen/ARM/funnel-shift.ll b/llvm/test/CodeGen/ARM/funnel-shift.ll
index 191155ae30f3e..77bed94918f2a 100644
--- a/llvm/test/CodeGen/ARM/funnel-shift.ll
+++ b/llvm/test/CodeGen/ARM/funnel-shift.ll
@@ -47,67 +47,77 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; SCALAR-LABEL: fshl_i37:
; SCALAR: @ %bb.0:
-; SCALAR-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; SCALAR-NEXT: push {r4, r5, r6, r7, r8, lr}
-; SCALAR-NEXT: mov r8, r0
-; SCALAR-NEXT: ldr r0, [sp, #28]
-; SCALAR-NEXT: mov r4, r1
-; SCALAR-NEXT: mov r5, r3
-; SCALAR-NEXT: and r1, r0, #31
-; SCALAR-NEXT: ldr r0, [sp, #24]
-; SCALAR-NEXT: mov r6, r2
-; SCALAR-NEXT: mov r2, #37
-; SCALAR-NEXT: mov r3, #0
-; SCALAR-NEXT: bl __aeabi_uldivmod
-; SCALAR-NEXT: lsl r0, r5, #27
-; SCALAR-NEXT: tst r2, #32
-; SCALAR-NEXT: orr r0, r0, r6, lsr #5
-; SCALAR-NEXT: mov r1, r8
-; SCALAR-NEXT: and r3, r2, #31
-; SCALAR-NEXT: mov r7, #31
+; SCALAR-NEXT: .save {r4, r5, r6, r7, r11, lr}
+; SCALAR-NEXT: push {r4, r5, r6, r7, r11, lr}
+; SCALAR-NEXT: ldr lr, [sp, #24]
+; SCALAR-NEXT: movw r12, #46053
+; SCALAR-NEXT: movt r12, #12398
+; SCALAR-NEXT: movw r6, #15941
+; SCALAR-NEXT: ldr r7, [sp, #28]
+; SCALAR-NEXT: movt r6, #1771
+; SCALAR-NEXT: umull r4, r5, lr, r12
+; SCALAR-NEXT: lsl r3, r3, #27
+; SCALAR-NEXT: mov r4, #0
+; SCALAR-NEXT: and r7, r7, #31
+; SCALAR-NEXT: umlal r5, r4, lr, r6
+; SCALAR-NEXT: orr r3, r3, r2, lsr #5
+; SCALAR-NEXT: umlal r5, r4, r7, r12
+; SCALAR-NEXT: mla r7, r7, r6, r4
+; SCALAR-NEXT: mov r6, #37
+; SCALAR-NEXT: mls r7, r7, r6, lr
+; SCALAR-NEXT: mov r6, r0
+; SCALAR-NEXT: tst r7, #32
+; SCALAR-NEXT: and r5, r7, #31
+; SCALAR-NEXT: movne r6, r3
+; SCALAR-NEXT: lslne r3, r2, #27
+; SCALAR-NEXT: lsr r2, r3, #1
+; SCALAR-NEXT: mov r3, #31
+; SCALAR-NEXT: bic r3, r3, r7
; SCALAR-NEXT: movne r1, r0
-; SCALAR-NEXT: lslne r0, r6, #27
-; SCALAR-NEXT: bic r2, r7, r2
-; SCALAR-NEXT: lsl r5, r1, r3
-; SCALAR-NEXT: lsr r0, r0, #1
-; SCALAR-NEXT: movne r4, r8
-; SCALAR-NEXT: lsr r1, r1, #1
-; SCALAR-NEXT: lsl r3, r4, r3
-; SCALAR-NEXT: orr r0, r5, r0, lsr r2
-; SCALAR-NEXT: orr r1, r3, r1, lsr r2
-; SCALAR-NEXT: pop {r4, r5, r6, r7, r8, pc}
+; SCALAR-NEXT: lsl r4, r6, r5
+; SCALAR-NEXT: lsl r0, r1, r5
+; SCALAR-NEXT: lsr r1, r6, #1
+; SCALAR-NEXT: orr r2, r4, r2, lsr r3
+; SCALAR-NEXT: orr r1, r0, r1, lsr r3
+; SCALAR-NEXT: mov r0, r2
+; SCALAR-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; NEON-LABEL: fshl_i37:
; NEON: @ %bb.0:
-; NEON-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; NEON-NEXT: push {r4, r5, r6, r7, r8, lr}
-; NEON-NEXT: mov r4, r1
-; NEON-NEXT: ldr r1, [sp, #28]
-; NEON-NEXT: mov r8, r0
-; NEON-NEXT: ldr r0, [sp, #24]
-; NEON-NEXT: and r1, r1, #31
-; NEON-NEXT: mov r5, r3
-; NEON-NEXT: mov r6, r2
-; NEON-NEXT: mov r2, #37
-; NEON-NEXT: mov r3, #0
-; NEON-NEXT: bl __aeabi_uldivmod
-; NEON-NEXT: lsl r0, r5, #27
-; NEON-NEXT: tst r2, #32
-; NEON-NEXT: orr r0, r0, r6, lsr #5
-; NEON-NEXT: mov r1, r8
-; NEON-NEXT: and r3, r2, #31
-; NEON-NEXT: mov r7, #31
+; NEON-NEXT: .save {r4, r5, r6, r7, r11, lr}
+; NEON-NEXT: push {r4, r5, r6, r7, r11, lr}
+; NEON-NEXT: ldr r12, [sp, #24]
+; NEON-NEXT: movw lr, #46053
+; NEON-NEXT: movt lr, #12398
+; NEON-NEXT: ldr r6, [sp, #28]
+; NEON-NEXT: mov r7, #0
+; NEON-NEXT: lsl r3, r3, #27
+; NEON-NEXT: umull r4, r5, r12, lr
+; NEON-NEXT: and r6, r6, #31
+; NEON-NEXT: movw r4, #15941
+; NEON-NEXT: movt r4, #1771
+; NEON-NEXT: umlal r5, r7, r12, r4
+; NEON-NEXT: orr r3, r3, r2, lsr #5
+; NEON-NEXT: umlal r5, r7, r6, lr
+; NEON-NEXT: mla r7, r6, r4, r7
+; NEON-NEXT: mov r6, #37
+; NEON-NEXT: mls r7, r7, r6, r12
+; NEON-NEXT: mov r6, r0
+; NEON-NEXT: tst r7, #32
+; NEON-NEXT: and r5, r7, #31
+; NEON-NEXT: movne r6, r3
+; NEON-NEXT: lslne r3, r2, #27
+; NEON-NEXT: lsr r2, r3, #1
+; NEON-NEXT: mov r3, #31
+; NEON-NEXT: bic r3, r3, r7
; NEON-NEXT: movne r1, r0
-; NEON-NEXT: lslne r0, r6, #27
-; NEON-NEXT: bic r2, r7, r2
-; NEON-NEXT: lsl r5, r1, r3
-; NEON-NEXT: lsr r0, r0, #1
-; NEON-NEXT: movne r4, r8
-; NEON-NEXT: lsr r1, r1, #1
-; NEON-NEXT: lsl r3, r4, r3
-; NEON-NEXT: orr r0, r5, r0, lsr r2
-; NEON-NEXT: orr r1, r3, r1, lsr r2
-; NEON-NEXT: pop {r4, r5, r6, r7, r8, pc}
+; NEON-NEXT: lsl r4, r6, r5
+; NEON-NEXT: lsl r0, r1, r5
+; NEON-NEXT: lsr r1, r6, #1
+; NEON-NEXT: orr r2, r4, r2, lsr r3
+; NEON-NEXT: orr r1, r0, r1, lsr r3
+; NEON-NEXT: mov r0, r2
+; NEON-NEXT: pop {r4, r5, r6, r7, r11, pc}
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
@@ -237,66 +247,76 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; SCALAR: @ %bb.0:
; SCALAR-NEXT: .save {r4, r5, r6, r7, r11, lr}
; SCALAR-NEXT: push {r4, r5, r6, r7, r11, lr}
-; SCALAR-NEXT: mov r5, r0
-; SCALAR-NEXT: ldr r0, [sp, #28]
-; SCALAR-NEXT: mov r4, r1
-; SCALAR-NEXT: mov r6, r3
-; SCALAR-NEXT: and r1, r0, #31
-; SCALAR-NEXT: ldr r0, [sp, #24]
-; SCALAR-NEXT: mov r7, r2
-; SCALAR-NEXT: mov r2, #37
-; SCALAR-NEXT: mov r3, #0
-; SCALAR-NEXT: bl __aeabi_uldivmod
-; SCALAR-NEXT: add r0, r2, #27
-; SCALAR-NEXT: lsl r2, r6, #27
-; SCALAR-NEXT: orr r2, r2, r7, lsr #5
-; SCALAR-NEXT: mov r1, #31
-; SCALAR-NEXT: tst r0, #32
-; SCALAR-NEXT: mov r3, r5
-; SCALAR-NEXT: moveq r3, r2
-; SCALAR-NEXT: lsleq r2, r7, #27
-; SCALAR-NEXT: bic r1, r1, r0
-; SCALAR-NEXT: and r7, r0, #31
-; SCALAR-NEXT: lsl r6, r3, #1
-; SCALAR-NEXT: moveq r4, r5
-; SCALAR-NEXT: lsl r6, r6, r1
-; SCALAR-NEXT: orr r0, r6, r2, lsr r7
-; SCALAR-NEXT: lsl r2, r4, #1
-; SCALAR-NEXT: lsl r1, r2, r1
-; SCALAR-NEXT: orr r1, r1, r3, lsr r7
+; SCALAR-NEXT: ldr lr, [sp, #24]
+; SCALAR-NEXT: movw r12, #46053
+; SCALAR-NEXT: movt r12, #12398
+; SCALAR-NEXT: movw r6, #15941
+; SCALAR-NEXT: ldr r7, [sp, #28]
+; SCALAR-NEXT: movt r6, #1771
+; SCALAR-NEXT: umull r4, r5, lr, r12
+; SCALAR-NEXT: lsl r3, r3, #27
+; SCALAR-NEXT: mov r4, #0
+; SCALAR-NEXT: and r7, r7, #31
+; SCALAR-NEXT: umlal r5, r4, lr, r6
+; SCALAR-NEXT: orr r3, r3, r2, lsr #5
+; SCALAR-NEXT: umlal r5, r4, r7, r12
+; SCALAR-NEXT: mov r5, #31
+; SCALAR-NEXT: mla r7, r7, r6, r4
+; SCALAR-NEXT: mov r6, #37
+; SCALAR-NEXT: mls r7, r7, r6, lr
+; SCALAR-NEXT: mov r6, r0
+; SCALAR-NEXT: add r7, r7, #27
+; SCALAR-NEXT: tst r7, #32
+; SCALAR-NEXT: bic r5, r5, r7
+; SCALAR-NEXT: moveq r6, r3
+; SCALAR-NEXT: lsleq r3, r2, #27
+; SCALAR-NEXT: lsl r2, r6, #1
+; SCALAR-NEXT: and r7, r7, #31
+; SCALAR-NEXT: lsl r2, r2, r5
+; SCALAR-NEXT: moveq r1, r0
+; SCALAR-NEXT: lsl r0, r1, #1
+; SCALAR-NEXT: orr r2, r2, r3, lsr r7
+; SCALAR-NEXT: lsl r0, r0, r5
+; SCALAR-NEXT: orr r1, r0, r6, lsr r7
+; SCALAR-NEXT: mov r0, r2
; SCALAR-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; NEON-LABEL: fshr_i37:
; NEON: @ %bb.0:
; NEON-NEXT: .save {r4, r5, r6, r7, r11, lr}
; NEON-NEXT: push {r4, r5, r6, r7, r11, lr}
-; NEON-NEXT: mov r4, r1
-; NEON-NEXT: ldr r1, [sp, #28]
-; NEON-NEXT: mov r5, r0
-; NEON-NEXT: ldr r0, [sp, #24]
-; NEON-NEXT: and r1, r1, #31
-; NEON-NEXT: mov r6, r3
-; NEON-NEXT: mov r7, r2
-; NEON-NEXT: mov r2, #37
-; NEON-NEXT: mov r3, #0
-; NEON-NEXT: bl __aeabi_uldivmod
-; NEON-NEXT: add r0, r2, #27
-; NEON-NEXT: lsl r2, r6, #27
-; NEON-NEXT: orr r2, r2, r7, lsr #5
-; NEON-NEXT: mov r1, #31
-; NEON-NEXT: tst r0, #32
-; NEON-NEXT: mov r3, r5
-; NEON-NEXT: moveq r3, r2
-; NEON-NEXT: lsleq r2, r7, #27
-; NEON-NEXT: bic r1, r1, r0
-; NEON-NEXT: and r7, r0, #31
-; NEON-NEXT: lsl r6, r3, #1
-; NEON-NEXT: moveq r4, r5
-; NEON-NEXT: lsl r6, r6, r1
-; NEON-NEXT: orr r0, r6, r2, lsr r7
-; NEON-NEXT: lsl r2, r4, #1
-; NEON-NEXT: lsl r1, r2, r1
-; NEON-NEXT: orr r1, r1, r3, lsr r7
+; NEON-NEXT: ldr r12, [sp, #24]
+; NEON-NEXT: movw lr, #46053
+; NEON-NEXT: movt lr, #12398
+; NEON-NEXT: ldr r6, [sp, #28]
+; NEON-NEXT: mov r7, #0
+; NEON-NEXT: lsl r3, r3, #27
+; NEON-NEXT: umull r4, r5, r12, lr
+; NEON-NEXT: and r6, r6, #31
+; NEON-NEXT: movw r4, #15941
+; NEON-NEXT: movt r4, #1771
+; NEON-NEXT: umlal r5, r7, r12, r4
+; NEON-NEXT: orr r3, r3, r2, lsr #5
+; NEON-NEXT: umlal r5, r7, r6, lr
+; NEON-NEXT: mov r5, #31
+; NEON-NEXT: mla r7, r6, r4, r7
+; NEON-NEXT: mov r6, #37
+; NEON-NEXT: mls r7, r7, r6, r12
+; NEON-NEXT: mov r6, r0
+; NEON-NEXT: add r7, r7, #27
+; NEON-NEXT: tst r7, #32
+; NEON-NEXT: bic r5, r5, r7
+; NEON-NEXT: moveq r6, r3
+; NEON-NEXT: lsleq r3, r2, #27
+; NEON-NEXT: lsl r2, r6, #1
+; NEON-NEXT: and r7, r7, #31
+; NEON-NEXT: lsl r2, r2, r5
+; NEON-NEXT: moveq r1, r0
+; NEON-NEXT: lsl r0, r1, #1
+; NEON-NEXT: orr r2, r2, r3, lsr r7
+; NEON-NEXT: lsl r0, r0, r5
+; NEON-NEXT: orr r1, r0, r6, lsr r7
+; NEON-NEXT: mov r0, r2
; NEON-NEXT: pop {r4, r5, r6, r7, r11, pc}
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
diff --git a/llvm/test/CodeGen/Mips/funnel-shift.ll b/llvm/test/CodeGen/Mips/funnel-shift.ll
index 99e0d47441a02..51e212c8c5ae0 100644
--- a/llvm/test/CodeGen/Mips/funnel-shift.ll
+++ b/llvm/test/CodeGen/Mips/funnel-shift.ll
@@ -48,105 +48,106 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-BE-LABEL: fshl_i37:
; CHECK-BE: # %bb.0:
-; CHECK-BE-NEXT: addiu $sp, $sp, -40
-; CHECK-BE-NEXT: .cfi_def_cfa_offset 40
-; CHECK-BE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: .cfi_offset 31, -4
-; CHECK-BE-NEXT: .cfi_offset 19, -8
-; CHECK-BE-NEXT: .cfi_offset 18, -12
-; CHECK-BE-NEXT: .cfi_offset 17, -16
-; CHECK-BE-NEXT: .cfi_offset 16, -20
-; CHECK-BE-NEXT: move $16, $7
-; CHECK-BE-NEXT: move $17, $6
-; CHECK-BE-NEXT: move $18, $5
-; CHECK-BE-NEXT: move $19, $4
-; CHECK-BE-NEXT: lw $1, 56($sp)
-; CHECK-BE-NEXT: andi $4, $1, 31
-; CHECK-BE-NEXT: lw $5, 60($sp)
-; CHECK-BE-NEXT: addiu $6, $zero, 0
-; CHECK-BE-NEXT: jal __umoddi3
-; CHECK-BE-NEXT: addiu $7, $zero, 37
-; CHECK-BE-NEXT: srl $1, $3, 5
-; CHECK-BE-NEXT: andi $1, $1, 1
-; CHECK-BE-NEXT: movn $19, $18, $1
-; CHECK-BE-NEXT: sllv $2, $19, $3
-; CHECK-BE-NEXT: not $4, $3
-; CHECK-BE-NEXT: srl $5, $16, 5
-; CHECK-BE-NEXT: sll $6, $17, 27
-; CHECK-BE-NEXT: or $5, $6, $5
-; CHECK-BE-NEXT: movn $18, $5, $1
-; CHECK-BE-NEXT: srl $6, $18, 1
-; CHECK-BE-NEXT: srlv $6, $6, $4
-; CHECK-BE-NEXT: or $2, $2, $6
-; CHECK-BE-NEXT: sllv $3, $18, $3
-; CHECK-BE-NEXT: sll $6, $16, 27
-; CHECK-BE-NEXT: movn $5, $6, $1
-; CHECK-BE-NEXT: srl $1, $5, 1
-; CHECK-BE-NEXT: srlv $1, $1, $4
-; CHECK-BE-NEXT: or $3, $3, $1
-; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; CHECK-BE-NEXT: lui $1, 1771
+; CHECK-BE-NEXT: ori $1, $1, 15941
+; CHECK-BE-NEXT: lw $2, 20($sp)
+; CHECK-BE-NEXT: multu $2, $1
+; CHECK-BE-NEXT: mfhi $3
+; CHECK-BE-NEXT: mflo $8
+; CHECK-BE-NEXT: lui $9, 12398
+; CHECK-BE-NEXT: ori $9, $9, 46053
+; CHECK-BE-NEXT: multu $2, $9
+; CHECK-BE-NEXT: mfhi $10
+; CHECK-BE-NEXT: lw $11, 16($sp)
+; CHECK-BE-NEXT: andi $11, $11, 31
+; CHECK-BE-NEXT: multu $11, $9
+; CHECK-BE-NEXT: mflo $9
+; CHECK-BE-NEXT: mfhi $12
+; CHECK-BE-NEXT: addu $8, $10, $8
+; CHECK-BE-NEXT: sltu $10, $8, $10
+; CHECK-BE-NEXT: addu $9, $8, $9
+; CHECK-BE-NEXT: sltu $8, $9, $8
+; CHECK-BE-NEXT: addu $3, $3, $10
+; CHECK-BE-NEXT: srl $9, $7, 5
+; CHECK-BE-NEXT: sll $6, $6, 27
+; CHECK-BE-NEXT: or $6, $6, $9
+; CHECK-BE-NEXT: addu $3, $3, $12
+; CHECK-BE-NEXT: sll $7, $7, 27
+; CHECK-BE-NEXT: addu $3, $3, $8
+; CHECK-BE-NEXT: mul $1, $11, $1
+; CHECK-BE-NEXT: addu $1, $3, $1
+; CHECK-BE-NEXT: sll $3, $1, 2
+; CHECK-BE-NEXT: addu $3, $3, $1
+; CHECK-BE-NEXT: sll $1, $1, 5
+; CHECK-BE-NEXT: addu $1, $1, $3
+; CHECK-BE-NEXT: subu $1, $2, $1
+; CHECK-BE-NEXT: andi $2, $1, 32
+; CHECK-BE-NEXT: srl $3, $2, 5
+; CHECK-BE-NEXT: movn $4, $5, $3
+; CHECK-BE-NEXT: sllv $2, $4, $1
+; CHECK-BE-NEXT: not $4, $1
+; CHECK-BE-NEXT: movn $5, $6, $3
+; CHECK-BE-NEXT: srl $8, $5, 1
+; CHECK-BE-NEXT: srlv $8, $8, $4
+; CHECK-BE-NEXT: or $2, $2, $8
+; CHECK-BE-NEXT: sllv $1, $5, $1
+; CHECK-BE-NEXT: movn $6, $7, $3
+; CHECK-BE-NEXT: srl $3, $6, 1
+; CHECK-BE-NEXT: srlv $3, $3, $4
; CHECK-BE-NEXT: jr $ra
-; CHECK-BE-NEXT: addiu $sp, $sp, 40
+; CHECK-BE-NEXT: or $3, $1, $3
;
; CHECK-LE-LABEL: fshl_i37:
; CHECK-LE: # %bb.0:
-; CHECK-LE-NEXT: addiu $sp, $sp, -40
-; CHECK-LE-NEXT: .cfi_def_cfa_offset 40
-; CHECK-LE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: .cfi_offset 31, -4
-; CHECK-LE-NEXT: .cfi_offset 19, -8
-; CHECK-LE-NEXT: .cfi_offset 18, -12
-; CHECK-LE-NEXT: .cfi_offset 17, -16
-; CHECK-LE-NEXT: .cfi_offset 16, -20
-; CHECK-LE-NEXT: move $16, $7
-; CHECK-LE-NEXT: move $17, $6
-; CHECK-LE-NEXT: move $18, $5
-; CHECK-LE-NEXT: move $19, $4
-; CHECK-LE-NEXT: lw $1, 60($sp)
-; CHECK-LE-NEXT: andi $5, $1, 31
-; CHECK-LE-NEXT: lw $4, 56($sp)
-; CHECK-LE-NEXT: addiu $6, $zero, 37
-; CHECK-LE-NEXT: jal __umoddi3
-; CHECK-LE-NEXT: addiu $7, $zero, 0
-; CHECK-LE-NEXT: srl $1, $2, 5
-; CHECK-LE-NEXT: andi $3, $1, 1
-; CHECK-LE-NEXT: srl $1, $17, 5
-; CHECK-LE-NEXT: sll $4, $16, 27
-; CHECK-LE-NEXT: or $1, $4, $1
-; CHECK-LE-NEXT: move $4, $19
-; CHECK-LE-NEXT: movn $4, $1, $3
-; CHECK-LE-NEXT: sllv $5, $4, $2
-; CHECK-LE-NEXT: not $6, $2
-; CHECK-LE-NEXT: sll $7, $17, 27
-; CHECK-LE-NEXT: movn $1, $7, $3
-; CHECK-LE-NEXT: srl $1, $1, 1
-; CHECK-LE-NEXT: srlv $1, $1, $6
-; CHECK-LE-NEXT: or $1, $5, $1
-; CHECK-LE-NEXT: movn $18, $19, $3
-; CHECK-LE-NEXT: sllv $2, $18, $2
-; CHECK-LE-NEXT: srl $3, $4, 1
-; CHECK-LE-NEXT: srlv $3, $3, $6
-; CHECK-LE-NEXT: or $3, $2, $3
-; CHECK-LE-NEXT: move $2, $1
-; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; CHECK-LE-NEXT: lui $1, 1771
+; CHECK-LE-NEXT: ori $1, $1, 15941
+; CHECK-LE-NEXT: lw $2, 16($sp)
+; CHECK-LE-NEXT: multu $2, $1
+; CHECK-LE-NEXT: mfhi $3
+; CHECK-LE-NEXT: mflo $8
+; CHECK-LE-NEXT: lui $9, 12398
+; CHECK-LE-NEXT: ori $9, $9, 46053
+; CHECK-LE-NEXT: multu $2, $9
+; CHECK-LE-NEXT: mfhi $10
+; CHECK-LE-NEXT: lw $11, 20($sp)
+; CHECK-LE-NEXT: andi $11, $11, 31
+; CHECK-LE-NEXT: multu $11, $9
+; CHECK-LE-NEXT: mflo $9
+; CHECK-LE-NEXT: mfhi $12
+; CHECK-LE-NEXT: addu $8, $10, $8
+; CHECK-LE-NEXT: sltu $10, $8, $10
+; CHECK-LE-NEXT: addu $9, $8, $9
+; CHECK-LE-NEXT: sltu $8, $9, $8
+; CHECK-LE-NEXT: addu $3, $3, $10
+; CHECK-LE-NEXT: srl $9, $6, 5
+; CHECK-LE-NEXT: sll $7, $7, 27
+; CHECK-LE-NEXT: or $7, $7, $9
+; CHECK-LE-NEXT: sll $6, $6, 27
+; CHECK-LE-NEXT: addu $3, $3, $12
+; CHECK-LE-NEXT: addu $3, $3, $8
+; CHECK-LE-NEXT: mul $1, $11, $1
+; CHECK-LE-NEXT: addu $1, $3, $1
+; CHECK-LE-NEXT: sll $3, $1, 2
+; CHECK-LE-NEXT: addu $3, $3, $1
+; CHECK-LE-NEXT: sll $1, $1, 5
+; CHECK-LE-NEXT: addu $1, $1, $3
+; CHECK-LE-NEXT: subu $1, $2, $1
+; CHECK-LE-NEXT: andi $2, $1, 32
+; CHECK-LE-NEXT: srl $3, $2, 5
+; CHECK-LE-NEXT: move $8, $4
+; CHECK-LE-NEXT: movn $8, $7, $3
+; CHECK-LE-NEXT: sllv $2, $8, $1
+; CHECK-LE-NEXT: not $9, $1
+; CHECK-LE-NEXT: movn $7, $6, $3
+; CHECK-LE-NEXT: srl $6, $7, 1
+; CHECK-LE-NEXT: srlv $6, $6, $9
+; CHECK-LE-NEXT: or $2, $2, $6
+; CHECK-LE-NEXT: movn $5, $4, $3
+; CHECK-LE-NEXT: sllv $1, $5, $1
+; CHECK-LE-NEXT: srl $3, $8, 1
+; CHECK-LE-NEXT: srlv $3, $3, $9
; CHECK-LE-NEXT: jr $ra
-; CHECK-LE-NEXT: addiu $sp, $sp, 40
+; CHECK-LE-NEXT: or $3, $1, $3
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
@@ -288,104 +289,106 @@ declare i37 @llvm.fshr.i37(i37, i37, i37)
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-BE-LABEL: fshr_i37:
; CHECK-BE: # %bb.0:
-; CHECK-BE-NEXT: addiu $sp, $sp, -40
-; CHECK-BE-NEXT: .cfi_def_cfa_offset 40
-; CHECK-BE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: .cfi_offset 31, -4
-; CHECK-BE-NEXT: .cfi_offset 19, -8
-; CHECK-BE-NEXT: .cfi_offset 18, -12
-; CHECK-BE-NEXT: .cfi_offset 17, -16
-; CHECK-BE-NEXT: .cfi_offset 16, -20
-; CHECK-BE-NEXT: move $16, $7
-; CHECK-BE-NEXT: move $17, $6
-; CHECK-BE-NEXT: move $18, $5
-; CHECK-BE-NEXT: move $19, $4
-; CHECK-BE-NEXT: lw $1, 56($sp)
-; CHECK-BE-NEXT: andi $4, $1, 31
-; CHECK-BE-NEXT: lw $5, 60($sp)
-; CHECK-BE-NEXT: addiu $6, $zero, 0
-; CHECK-BE-NEXT: jal __umoddi3
-; CHECK-BE-NEXT: addiu $7, $zero, 37
-; CHECK-BE-NEXT: addiu $1, $3, 27
+; CHECK-BE-NEXT: lui $1, 1771
+; CHECK-BE-NEXT: ori $1, $1, 15941
+; CHECK-BE-NEXT: lw $2, 20($sp)
+; CHECK-BE-NEXT: multu $2, $1
+; CHECK-BE-NEXT: mfhi $3
+; CHECK-BE-NEXT: mflo $8
+; CHECK-BE-NEXT: lui $9, 12398
+; CHECK-BE-NEXT: ori $9, $9, 46053
+; CHECK-BE-NEXT: multu $2, $9
+; CHECK-BE-NEXT: mfhi $10
+; CHECK-BE-NEXT: lw $11, 16($sp)
+; CHECK-BE-NEXT: andi $11, $11, 31
+; CHECK-BE-NEXT: multu $11, $9
+; CHECK-BE-NEXT: mflo $9
+; CHECK-BE-NEXT: mfhi $12
+; CHECK-BE-NEXT: addu $8, $10, $8
+; CHECK-BE-NEXT: sltu $10, $8, $10
+; CHECK-BE-NEXT: addu $9, $8, $9
+; CHECK-BE-NEXT: sltu $8, $9, $8
+; CHECK-BE-NEXT: addu $3, $3, $10
+; CHECK-BE-NEXT: srl $9, $7, 5
+; CHECK-BE-NEXT: sll $6, $6, 27
+; CHECK-BE-NEXT: or $6, $6, $9
+; CHECK-BE-NEXT: sll $7, $7, 27
+; CHECK-BE-NEXT: addu $3, $3, $12
+; CHECK-BE-NEXT: addu $3, $3, $8
+; CHECK-BE-NEXT: mul $1, $11, $1
+; CHECK-BE-NEXT: addu $1, $3, $1
+; CHECK-BE-NEXT: sll $3, $1, 2
+; CHECK-BE-NEXT: addu $3, $3, $1
+; CHECK-BE-NEXT: sll $1, $1, 5
+; CHECK-BE-NEXT: addu $1, $1, $3
+; CHECK-BE-NEXT: subu $1, $2, $1
+; CHECK-BE-NEXT: addiu $1, $1, 27
; CHECK-BE-NEXT: andi $3, $1, 32
-; CHECK-BE-NEXT: srl $2, $16, 5
-; CHECK-BE-NEXT: sll $4, $17, 27
-; CHECK-BE-NEXT: or $4, $4, $2
-; CHECK-BE-NEXT: movz $19, $18, $3
-; CHECK-BE-NEXT: movz $18, $4, $3
-; CHECK-BE-NEXT: srlv $2, $18, $1
-; CHECK-BE-NEXT: not $5, $1
-; CHECK-BE-NEXT: sll $6, $19, 1
-; CHECK-BE-NEXT: sllv $6, $6, $5
-; CHECK-BE-NEXT: sll $7, $16, 27
-; CHECK-BE-NEXT: or $2, $6, $2
-; CHECK-BE-NEXT: movz $4, $7, $3
-; CHECK-BE-NEXT: srlv $1, $4, $1
-; CHECK-BE-NEXT: sll $3, $18, 1
-; CHECK-BE-NEXT: sllv $3, $3, $5
-; CHECK-BE-NEXT: or $3, $3, $1
-; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; CHECK-BE-NEXT: movz $4, $5, $3
+; CHECK-BE-NEXT: movz $5, $6, $3
+; CHECK-BE-NEXT: srlv $2, $5, $1
+; CHECK-BE-NEXT: not $8, $1
+; CHECK-BE-NEXT: sll $4, $4, 1
+; CHECK-BE-NEXT: sllv $4, $4, $8
+; CHECK-BE-NEXT: or $2, $4, $2
+; CHECK-BE-NEXT: movz $6, $7, $3
+; CHECK-BE-NEXT: srlv $1, $6, $1
+; CHECK-BE-NEXT: sll $3, $5, 1
+; CHECK-BE-NEXT: sllv $3, $3, $8
; CHECK-BE-NEXT: jr $ra
-; CHECK-BE-NEXT: addiu $sp, $sp, 40
+; CHECK-BE-NEXT: or $3, $3, $1
;
; CHECK-LE-LABEL: fshr_i37:
; CHECK-LE: # %bb.0:
-; CHECK-LE-NEXT: addiu $sp, $sp, -40
-; CHECK-LE-NEXT: .cfi_def_cfa_offset 40
-; CHECK-LE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: .cfi_offset 31, -4
-; CHECK-LE-NEXT: .cfi_offset 19, -8
-; CHECK-LE-NEXT: .cfi_offset 18, -12
-; CHECK-LE-NEXT: .cfi_offset 17, -16
-; CHECK-LE-NEXT: .cfi_offset 16, -20
-; CHECK-LE-NEXT: move $16, $7
-; CHECK-LE-NEXT: move $17, $6
-; CHECK-LE-NEXT: move $18, $5
-; CHECK-LE-NEXT: move $19, $4
-; CHECK-LE-NEXT: lw $1, 60($sp)
-; CHECK-LE-NEXT: andi $5, $1, 31
-; CHECK-LE-NEXT: lw $4, 56($sp)
-; CHECK-LE-NEXT: addiu $6, $zero, 37
-; CHECK-LE-NEXT: jal __umoddi3
-; CHECK-LE-NEXT: addiu $7, $zero, 0
-; CHECK-LE-NEXT: addiu $1, $2, 27
+; CHECK-LE-NEXT: lui $1, 1771
+; CHECK-LE-NEXT: ori $1, $1, 15941
+; CHECK-LE-NEXT: lw $2, 16($sp)
+; CHECK-LE-NEXT: multu $2, $1
+; CHECK-LE-NEXT: mfhi $3
+; CHECK-LE-NEXT: mflo $8
+; CHECK-LE-NEXT: lui $9, 12398
+; CHECK-LE-NEXT: ori $9, $9, 46053
+; CHECK-LE-NEXT: multu $2, $9
+; CHECK-LE-NEXT: mfhi $10
+; CHECK-LE-NEXT: lw $11, 20($sp)
+; CHECK-LE-NEXT: andi $11, $11, 31
+; CHECK-LE-NEXT: multu $11, $9
+; CHECK-LE-NEXT: mflo $9
+; CHECK-LE-NEXT: mfhi $12
+; CHECK-LE-NEXT: addu $8, $10, $8
+; CHECK-LE-NEXT: sltu $10, $8, $10
+; CHECK-LE-NEXT: addu $9, $8, $9
+; CHECK-LE-NEXT: sltu $8, $9, $8
+; CHECK-LE-NEXT: addu $3, $3, $10
+; CHECK-LE-NEXT: srl $9, $6, 5
+; CHECK-LE-NEXT: sll $7, $7, 27
+; CHECK-LE-NEXT: or $7, $7, $9
+; CHECK-LE-NEXT: sll $6, $6, 27
+; CHECK-LE-NEXT: addu $3, $3, $12
+; CHECK-LE-NEXT: addu $3, $3, $8
+; CHECK-LE-NEXT: mul $1, $11, $1
+; CHECK-LE-NEXT: addu $1, $3, $1
+; CHECK-LE-NEXT: sll $3, $1, 2
+; CHECK-LE-NEXT: addu $3, $3, $1
+; CHECK-LE-NEXT: sll $1, $1, 5
+; CHECK-LE-NEXT: addu $1, $1, $3
+; CHECK-LE-NEXT: subu $1, $2, $1
+; CHECK-LE-NEXT: addiu $1, $1, 27
; CHECK-LE-NEXT: andi $3, $1, 32
-; CHECK-LE-NEXT: srl $2, $17, 5
-; CHECK-LE-NEXT: sll $4, $16, 27
-; CHECK-LE-NEXT: or $2, $4, $2
-; CHECK-LE-NEXT: sll $4, $17, 27
-; CHECK-LE-NEXT: move $5, $19
-; CHECK-LE-NEXT: movz $5, $2, $3
-; CHECK-LE-NEXT: movz $2, $4, $3
-; CHECK-LE-NEXT: srlv $2, $2, $1
-; CHECK-LE-NEXT: not $4, $1
-; CHECK-LE-NEXT: sll $6, $5, 1
-; CHECK-LE-NEXT: sllv $6, $6, $4
-; CHECK-LE-NEXT: or $2, $6, $2
-; CHECK-LE-NEXT: srlv $1, $5, $1
-; CHECK-LE-NEXT: movz $18, $19, $3
-; CHECK-LE-NEXT: sll $3, $18, 1
-; CHECK-LE-NEXT: sllv $3, $3, $4
-; CHECK-LE-NEXT: or $3, $3, $1
-; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; CHECK-LE-NEXT: move $8, $4
+; CHECK-LE-NEXT: movz $8, $7, $3
+; CHECK-LE-NEXT: movz $7, $6, $3
+; CHECK-LE-NEXT: srlv $2, $7, $1
+; CHECK-LE-NEXT: not $6, $1
+; CHECK-LE-NEXT: sll $7, $8, 1
+; CHECK-LE-NEXT: sllv $7, $7, $6
+; CHECK-LE-NEXT: or $2, $7, $2
+; CHECK-LE-NEXT: srlv $1, $8, $1
+; CHECK-LE-NEXT: movz $5, $4, $3
+; CHECK-LE-NEXT: sll $3, $5, 1
+; CHECK-LE-NEXT: sllv $3, $3, $6
; CHECK-LE-NEXT: jr $ra
-; CHECK-LE-NEXT: addiu $sp, $sp, 40
+; CHECK-LE-NEXT: or $3, $3, $1
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift.ll b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
index be95233656f47..952fede1d9b8d 100644
--- a/llvm/test/CodeGen/PowerPC/funnel-shift.ll
+++ b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
@@ -270,116 +270,94 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; CHECK32_32-LABEL: fshl_i37:
; CHECK32_32: # %bb.0:
-; CHECK32_32-NEXT: mflr 0
-; CHECK32_32-NEXT: stwu 1, -32(1)
-; CHECK32_32-NEXT: stw 0, 36(1)
-; CHECK32_32-NEXT: .cfi_def_cfa_offset 32
-; CHECK32_32-NEXT: .cfi_offset lr, 4
-; CHECK32_32-NEXT: .cfi_offset r27, -20
-; CHECK32_32-NEXT: .cfi_offset r28, -16
-; CHECK32_32-NEXT: .cfi_offset r29, -12
-; CHECK32_32-NEXT: .cfi_offset r30, -8
-; CHECK32_32-NEXT: stw 27, 12(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 27, 5
-; CHECK32_32-NEXT: stw 28, 16(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 28, 3
-; CHECK32_32-NEXT: stw 29, 20(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 29, 4
-; CHECK32_32-NEXT: stw 30, 24(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 30, 6
-; CHECK32_32-NEXT: clrlwi 3, 7, 27
-; CHECK32_32-NEXT: mr 4, 8
-; CHECK32_32-NEXT: li 5, 0
-; CHECK32_32-NEXT: li 6, 37
-; CHECK32_32-NEXT: bl __umoddi3
-; CHECK32_32-NEXT: rotlwi 5, 30, 27
-; CHECK32_32-NEXT: rlwimi 5, 27, 27, 0, 4
-; CHECK32_32-NEXT: andi. 3, 4, 32
-; CHECK32_32-NEXT: mr 6, 5
+; CHECK32_32-NEXT: lis 9, 1771
+; CHECK32_32-NEXT: lis 11, 12398
+; CHECK32_32-NEXT: ori 9, 9, 15941
+; CHECK32_32-NEXT: clrlwi 7, 7, 27
+; CHECK32_32-NEXT: ori 11, 11, 46053
+; CHECK32_32-NEXT: mulhwu 10, 8, 9
+; CHECK32_32-NEXT: mulhwu 12, 7, 11
+; CHECK32_32-NEXT: mullw 0, 8, 9
+; CHECK32_32-NEXT: mullw 9, 7, 9
+; CHECK32_32-NEXT: mullw 7, 7, 11
+; CHECK32_32-NEXT: mulhwu 11, 8, 11
+; CHECK32_32-NEXT: addc 11, 11, 0
+; CHECK32_32-NEXT: addze 10, 10
+; CHECK32_32-NEXT: addc 7, 11, 7
+; CHECK32_32-NEXT: adde 7, 10, 12
+; CHECK32_32-NEXT: add 7, 7, 9
+; CHECK32_32-NEXT: mulli 7, 7, 37
+; CHECK32_32-NEXT: sub 8, 8, 7
+; CHECK32_32-NEXT: andi. 7, 8, 32
+; CHECK32_32-NEXT: rotlwi 7, 6, 27
+; CHECK32_32-NEXT: rlwimi 7, 5, 27, 0, 4
+; CHECK32_32-NEXT: mr 5, 7
; CHECK32_32-NEXT: bne 0, .LBB3_2
; CHECK32_32-NEXT: # %bb.1:
-; CHECK32_32-NEXT: mr 6, 29
+; CHECK32_32-NEXT: mr 5, 4
; CHECK32_32-NEXT: .LBB3_2:
-; CHECK32_32-NEXT: clrlwi 4, 4, 27
-; CHECK32_32-NEXT: subfic 7, 4, 32
-; CHECK32_32-NEXT: srw 3, 6, 7
+; CHECK32_32-NEXT: clrlwi 8, 8, 27
+; CHECK32_32-NEXT: subfic 9, 8, 32
+; CHECK32_32-NEXT: srw 10, 5, 9
; CHECK32_32-NEXT: bne 0, .LBB3_4
; CHECK32_32-NEXT: # %bb.3:
-; CHECK32_32-NEXT: mr 29, 28
+; CHECK32_32-NEXT: mr 4, 3
; CHECK32_32-NEXT: .LBB3_4:
-; CHECK32_32-NEXT: slw 8, 29, 4
-; CHECK32_32-NEXT: or 3, 8, 3
+; CHECK32_32-NEXT: slw 3, 4, 8
+; CHECK32_32-NEXT: or 3, 3, 10
; CHECK32_32-NEXT: beq 0, .LBB3_6
; CHECK32_32-NEXT: # %bb.5:
-; CHECK32_32-NEXT: slwi 5, 30, 27
+; CHECK32_32-NEXT: slwi 7, 6, 27
; CHECK32_32-NEXT: .LBB3_6:
-; CHECK32_32-NEXT: srw 5, 5, 7
-; CHECK32_32-NEXT: slw 4, 6, 4
-; CHECK32_32-NEXT: or 4, 4, 5
-; CHECK32_32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 28, 16(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 27, 12(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 0, 36(1)
-; CHECK32_32-NEXT: addi 1, 1, 32
-; CHECK32_32-NEXT: mtlr 0
+; CHECK32_32-NEXT: srw 4, 7, 9
+; CHECK32_32-NEXT: slw 5, 5, 8
+; CHECK32_32-NEXT: or 4, 5, 4
; CHECK32_32-NEXT: blr
;
; CHECK32_64-LABEL: fshl_i37:
; CHECK32_64: # %bb.0:
-; CHECK32_64-NEXT: mflr 0
-; CHECK32_64-NEXT: stwu 1, -32(1)
-; CHECK32_64-NEXT: stw 0, 36(1)
-; CHECK32_64-NEXT: .cfi_def_cfa_offset 32
-; CHECK32_64-NEXT: .cfi_offset lr, 4
-; CHECK32_64-NEXT: .cfi_offset r27, -20
-; CHECK32_64-NEXT: .cfi_offset r28, -16
-; CHECK32_64-NEXT: .cfi_offset r29, -12
-; CHECK32_64-NEXT: .cfi_offset r30, -8
-; CHECK32_64-NEXT: stw 27, 12(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 27, 5
-; CHECK32_64-NEXT: li 5, 0
-; CHECK32_64-NEXT: stw 28, 16(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 28, 3
-; CHECK32_64-NEXT: clrlwi 3, 7, 27
-; CHECK32_64-NEXT: stw 29, 20(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 29, 4
-; CHECK32_64-NEXT: mr 4, 8
-; CHECK32_64-NEXT: stw 30, 24(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 30, 6
-; CHECK32_64-NEXT: li 6, 37
-; CHECK32_64-NEXT: bl __umoddi3
-; CHECK32_64-NEXT: rotlwi 5, 30, 27
-; CHECK32_64-NEXT: andi. 3, 4, 32
-; CHECK32_64-NEXT: rlwimi 5, 27, 27, 0, 4
-; CHECK32_64-NEXT: mr 6, 5
+; CHECK32_64-NEXT: lis 9, 1771
+; CHECK32_64-NEXT: lis 12, 12398
+; CHECK32_64-NEXT: ori 9, 9, 15941
+; CHECK32_64-NEXT: clrlwi 7, 7, 27
+; CHECK32_64-NEXT: ori 12, 12, 46053
+; CHECK32_64-NEXT: mulhwu 10, 8, 9
+; CHECK32_64-NEXT: mullw 11, 8, 9
+; CHECK32_64-NEXT: mulhwu 0, 7, 12
+; CHECK32_64-NEXT: mullw 9, 7, 9
+; CHECK32_64-NEXT: mullw 7, 7, 12
+; CHECK32_64-NEXT: mulhwu 12, 8, 12
+; CHECK32_64-NEXT: addc 11, 12, 11
+; CHECK32_64-NEXT: addze 10, 10
+; CHECK32_64-NEXT: addc 7, 11, 7
+; CHECK32_64-NEXT: adde 7, 10, 0
+; CHECK32_64-NEXT: add 7, 7, 9
+; CHECK32_64-NEXT: mulli 7, 7, 37
+; CHECK32_64-NEXT: sub 8, 8, 7
+; CHECK32_64-NEXT: andi. 7, 8, 32
+; CHECK32_64-NEXT: rotlwi 7, 6, 27
+; CHECK32_64-NEXT: rlwimi 7, 5, 27, 0, 4
+; CHECK32_64-NEXT: mr 5, 7
; CHECK32_64-NEXT: bne 0, .LBB3_2
; CHECK32_64-NEXT: # %bb.1:
-; CHECK32_64-NEXT: mr 6, 29
+; CHECK32_64-NEXT: mr 5, 4
; CHECK32_64-NEXT: .LBB3_2:
-; CHECK32_64-NEXT: clrlwi 4, 4, 27
-; CHECK32_64-NEXT: subfic 7, 4, 32
-; CHECK32_64-NEXT: srw 3, 6, 7
+; CHECK32_64-NEXT: clrlwi 8, 8, 27
+; CHECK32_64-NEXT: subfic 9, 8, 32
+; CHECK32_64-NEXT: srw 10, 5, 9
; CHECK32_64-NEXT: bne 0, .LBB3_4
; CHECK32_64-NEXT: # %bb.3:
-; CHECK32_64-NEXT: mr 29, 28
+; CHECK32_64-NEXT: mr 4, 3
; CHECK32_64-NEXT: .LBB3_4:
-; CHECK32_64-NEXT: slw 8, 29, 4
-; CHECK32_64-NEXT: or 3, 8, 3
+; CHECK32_64-NEXT: slw 3, 4, 8
+; CHECK32_64-NEXT: or 3, 3, 10
; CHECK32_64-NEXT: beq 0, .LBB3_6
; CHECK32_64-NEXT: # %bb.5:
-; CHECK32_64-NEXT: slwi 5, 30, 27
+; CHECK32_64-NEXT: slwi 7, 6, 27
; CHECK32_64-NEXT: .LBB3_6:
-; CHECK32_64-NEXT: srw 5, 5, 7
-; CHECK32_64-NEXT: slw 4, 6, 4
-; CHECK32_64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: or 4, 4, 5
-; CHECK32_64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 28, 16(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 27, 12(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 0, 36(1)
-; CHECK32_64-NEXT: addi 1, 1, 32
-; CHECK32_64-NEXT: mtlr 0
+; CHECK32_64-NEXT: srw 4, 7, 9
+; CHECK32_64-NEXT: slw 5, 5, 8
+; CHECK32_64-NEXT: or 4, 5, 4
; CHECK32_64-NEXT: blr
;
; CHECK64-LABEL: fshl_i37:
@@ -536,118 +514,96 @@ declare i37 @llvm.fshr.i37(i37, i37, i37)
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; CHECK32_32-LABEL: fshr_i37:
; CHECK32_32: # %bb.0:
-; CHECK32_32-NEXT: mflr 0
-; CHECK32_32-NEXT: stwu 1, -32(1)
-; CHECK32_32-NEXT: stw 0, 36(1)
-; CHECK32_32-NEXT: .cfi_def_cfa_offset 32
-; CHECK32_32-NEXT: .cfi_offset lr, 4
-; CHECK32_32-NEXT: .cfi_offset r27, -20
-; CHECK32_32-NEXT: .cfi_offset r28, -16
-; CHECK32_32-NEXT: .cfi_offset r29, -12
-; CHECK32_32-NEXT: .cfi_offset r30, -8
-; CHECK32_32-NEXT: stw 27, 12(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 27, 5
-; CHECK32_32-NEXT: stw 28, 16(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 28, 3
-; CHECK32_32-NEXT: stw 29, 20(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 29, 4
-; CHECK32_32-NEXT: stw 30, 24(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 30, 6
-; CHECK32_32-NEXT: clrlwi 3, 7, 27
-; CHECK32_32-NEXT: mr 4, 8
-; CHECK32_32-NEXT: li 5, 0
-; CHECK32_32-NEXT: li 6, 37
-; CHECK32_32-NEXT: bl __umoddi3
-; CHECK32_32-NEXT: rotlwi 5, 30, 27
-; CHECK32_32-NEXT: addi 3, 4, 27
-; CHECK32_32-NEXT: andi. 4, 3, 32
-; CHECK32_32-NEXT: rlwimi 5, 27, 27, 0, 4
-; CHECK32_32-NEXT: mr 4, 5
+; CHECK32_32-NEXT: lis 9, 1771
+; CHECK32_32-NEXT: lis 11, 12398
+; CHECK32_32-NEXT: ori 9, 9, 15941
+; CHECK32_32-NEXT: clrlwi 7, 7, 27
+; CHECK32_32-NEXT: ori 11, 11, 46053
+; CHECK32_32-NEXT: mulhwu 10, 8, 9
+; CHECK32_32-NEXT: mulhwu 12, 7, 11
+; CHECK32_32-NEXT: mullw 0, 8, 9
+; CHECK32_32-NEXT: mullw 9, 7, 9
+; CHECK32_32-NEXT: mullw 7, 7, 11
+; CHECK32_32-NEXT: mulhwu 11, 8, 11
+; CHECK32_32-NEXT: addc 11, 11, 0
+; CHECK32_32-NEXT: addze 10, 10
+; CHECK32_32-NEXT: addc 7, 11, 7
+; CHECK32_32-NEXT: adde 7, 10, 12
+; CHECK32_32-NEXT: add 7, 7, 9
+; CHECK32_32-NEXT: mulli 7, 7, 37
+; CHECK32_32-NEXT: sub 7, 8, 7
+; CHECK32_32-NEXT: addi 8, 7, 27
+; CHECK32_32-NEXT: andi. 7, 8, 32
+; CHECK32_32-NEXT: rotlwi 7, 6, 27
+; CHECK32_32-NEXT: rlwimi 7, 5, 27, 0, 4
+; CHECK32_32-NEXT: mr 5, 7
; CHECK32_32-NEXT: beq 0, .LBB11_2
; CHECK32_32-NEXT: # %bb.1:
-; CHECK32_32-NEXT: mr 4, 29
+; CHECK32_32-NEXT: mr 5, 4
; CHECK32_32-NEXT: .LBB11_2:
-; CHECK32_32-NEXT: clrlwi 6, 3, 27
-; CHECK32_32-NEXT: srw 3, 4, 6
+; CHECK32_32-NEXT: clrlwi 8, 8, 27
+; CHECK32_32-NEXT: srw 10, 5, 8
; CHECK32_32-NEXT: beq 0, .LBB11_4
; CHECK32_32-NEXT: # %bb.3:
-; CHECK32_32-NEXT: mr 29, 28
+; CHECK32_32-NEXT: mr 4, 3
; CHECK32_32-NEXT: .LBB11_4:
-; CHECK32_32-NEXT: subfic 7, 6, 32
-; CHECK32_32-NEXT: slw 8, 29, 7
-; CHECK32_32-NEXT: or 3, 8, 3
+; CHECK32_32-NEXT: subfic 9, 8, 32
+; CHECK32_32-NEXT: slw 3, 4, 9
+; CHECK32_32-NEXT: or 3, 3, 10
; CHECK32_32-NEXT: bne 0, .LBB11_6
; CHECK32_32-NEXT: # %bb.5:
-; CHECK32_32-NEXT: slwi 5, 30, 27
+; CHECK32_32-NEXT: slwi 7, 6, 27
; CHECK32_32-NEXT: .LBB11_6:
-; CHECK32_32-NEXT: srw 5, 5, 6
-; CHECK32_32-NEXT: slw 4, 4, 7
-; CHECK32_32-NEXT: or 4, 4, 5
-; CHECK32_32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 28, 16(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 27, 12(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 0, 36(1)
-; CHECK32_32-NEXT: addi 1, 1, 32
-; CHECK32_32-NEXT: mtlr 0
+; CHECK32_32-NEXT: srw 4, 7, 8
+; CHECK32_32-NEXT: slw 5, 5, 9
+; CHECK32_32-NEXT: or 4, 5, 4
; CHECK32_32-NEXT: blr
;
; CHECK32_64-LABEL: fshr_i37:
; CHECK32_64: # %bb.0:
-; CHECK32_64-NEXT: mflr 0
-; CHECK32_64-NEXT: stwu 1, -32(1)
-; CHECK32_64-NEXT: stw 0, 36(1)
-; CHECK32_64-NEXT: .cfi_def_cfa_offset 32
-; CHECK32_64-NEXT: .cfi_offset lr, 4
-; CHECK32_64-NEXT: .cfi_offset r27, -20
-; CHECK32_64-NEXT: .cfi_offset r28, -16
-; CHECK32_64-NEXT: .cfi_offset r29, -12
-; CHECK32_64-NEXT: .cfi_offset r30, -8
-; CHECK32_64-NEXT: stw 27, 12(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 27, 5
-; CHECK32_64-NEXT: li 5, 0
-; CHECK32_64-NEXT: stw 28, 16(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 28, 3
-; CHECK32_64-NEXT: clrlwi 3, 7, 27
-; CHECK32_64-NEXT: stw 29, 20(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 29, 4
-; CHECK32_64-NEXT: mr 4, 8
-; CHECK32_64-NEXT: stw 30, 24(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 30, 6
-; CHECK32_64-NEXT: li 6, 37
-; CHECK32_64-NEXT: bl __umoddi3
-; CHECK32_64-NEXT: rotlwi 5, 30, 27
-; CHECK32_64-NEXT: addi 3, 4, 27
-; CHECK32_64-NEXT: andi. 4, 3, 32
-; CHECK32_64-NEXT: rlwimi 5, 27, 27, 0, 4
-; CHECK32_64-NEXT: mr 4, 5
+; CHECK32_64-NEXT: lis 9, 1771
+; CHECK32_64-NEXT: lis 12, 12398
+; CHECK32_64-NEXT: ori 9, 9, 15941
+; CHECK32_64-NEXT: clrlwi 7, 7, 27
+; CHECK32_64-NEXT: ori 12, 12, 46053
+; CHECK32_64-NEXT: mulhwu 10, 8, 9
+; CHECK32_64-NEXT: mullw 11, 8, 9
+; CHECK32_64-NEXT: mulhwu 0, 7, 12
+; CHECK32_64-NEXT: mullw 9, 7, 9
+; CHECK32_64-NEXT: mullw 7, 7, 12
+; CHECK32_64-NEXT: mulhwu 12, 8, 12
+; CHECK32_64-NEXT: addc 11, 12, 11
+; CHECK32_64-NEXT: addze 10, 10
+; CHECK32_64-NEXT: addc 7, 11, 7
+; CHECK32_64-NEXT: adde 7, 10, 0
+; CHECK32_64-NEXT: add 7, 7, 9
+; CHECK32_64-NEXT: mulli 7, 7, 37
+; CHECK32_64-NEXT: sub 7, 8, 7
+; CHECK32_64-NEXT: addi 8, 7, 27
+; CHECK32_64-NEXT: andi. 7, 8, 32
+; CHECK32_64-NEXT: rotlwi 7, 6, 27
+; CHECK32_64-NEXT: rlwimi 7, 5, 27, 0, 4
+; CHECK32_64-NEXT: mr 5, 7
; CHECK32_64-NEXT: beq 0, .LBB11_2
; CHECK32_64-NEXT: # %bb.1:
-; CHECK32_64-NEXT: mr 4, 29
+; CHECK32_64-NEXT: mr 5, 4
; CHECK32_64-NEXT: .LBB11_2:
-; CHECK32_64-NEXT: clrlwi 6, 3, 27
-; CHECK32_64-NEXT: srw 3, 4, 6
+; CHECK32_64-NEXT: clrlwi 8, 8, 27
+; CHECK32_64-NEXT: srw 10, 5, 8
; CHECK32_64-NEXT: beq 0, .LBB11_4
; CHECK32_64-NEXT: # %bb.3:
-; CHECK32_64-NEXT: mr 29, 28
+; CHECK32_64-NEXT: mr 4, 3
; CHECK32_64-NEXT: .LBB11_4:
-; CHECK32_64-NEXT: subfic 7, 6, 32
-; CHECK32_64-NEXT: slw 8, 29, 7
-; CHECK32_64-NEXT: or 3, 8, 3
+; CHECK32_64-NEXT: subfic 9, 8, 32
+; CHECK32_64-NEXT: slw 3, 4, 9
+; CHECK32_64-NEXT: or 3, 3, 10
; CHECK32_64-NEXT: bne 0, .LBB11_6
; CHECK32_64-NEXT: # %bb.5:
-; CHECK32_64-NEXT: slwi 5, 30, 27
+; CHECK32_64-NEXT: slwi 7, 6, 27
; CHECK32_64-NEXT: .LBB11_6:
-; CHECK32_64-NEXT: srw 5, 5, 6
-; CHECK32_64-NEXT: slw 4, 4, 7
-; CHECK32_64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: or 4, 4, 5
-; CHECK32_64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 28, 16(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 27, 12(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 0, 36(1)
-; CHECK32_64-NEXT: addi 1, 1, 32
-; CHECK32_64-NEXT: mtlr 0
+; CHECK32_64-NEXT: srw 4, 7, 8
+; CHECK32_64-NEXT: slw 5, 5, 9
+; CHECK32_64-NEXT: or 4, 5, 4
; CHECK32_64-NEXT: blr
;
; CHECK64-LABEL: fshr_i37:
diff --git a/llvm/test/CodeGen/PowerPC/urem-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-lkk.ll
index 03fd0c0c7e8e2..c6e623938d819 100644
--- a/llvm/test/CodeGen/PowerPC/urem-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-lkk.ll
@@ -86,23 +86,85 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) {
}
define i64 @fold_urem_i64(i64 %x) {
-; CHECK-LABEL: fold_urem_i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: mflr 0
-; CHECK-NEXT: stwu 1, -16(1)
-; CHECK-NEXT: stw 0, 20(1)
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset lr, 4
-; CHECK-NEXT: li 5, 0
-; CHECK-NEXT: li 6, 98
-; CHECK-NEXT: bl __umoddi3
-; CHECK-NEXT: lwz 0, 20(1)
-; CHECK-NEXT: addi 1, 1, 16
-; CHECK-NEXT: mtlr 0
-; CHECK-NEXT: blr
+; PPC32-LABEL: fold_urem_i64:
+; PPC32: # %bb.0:
+; PPC32-NEXT: lis 5, 21399
+; PPC32-NEXT: lis 8, -17388
+; PPC32-NEXT: rotlwi 11, 4, 31
+; PPC32-NEXT: ori 5, 5, 33436
+; PPC32-NEXT: srwi 6, 3, 1
+; PPC32-NEXT: ori 8, 8, 58849
+; PPC32-NEXT: rlwimi 11, 3, 31, 0, 0
+; PPC32-NEXT: mulhwu 7, 6, 5
+; PPC32-NEXT: mulhwu 9, 6, 8
+; PPC32-NEXT: mullw 10, 6, 8
+; PPC32-NEXT: mullw 6, 6, 5
+; PPC32-NEXT: mulhwu 12, 11, 5
+; PPC32-NEXT: mullw 5, 11, 5
+; PPC32-NEXT: mulhwu 8, 11, 8
+; PPC32-NEXT: addc 5, 8, 5
+; PPC32-NEXT: addze 11, 12
+; PPC32-NEXT: addc 5, 5, 10
+; PPC32-NEXT: adde 5, 11, 9
+; PPC32-NEXT: addze 7, 7
+; PPC32-NEXT: addc 5, 5, 6
+; PPC32-NEXT: addze 6, 7
+; PPC32-NEXT: rotlwi 5, 5, 28
+; PPC32-NEXT: li 8, 98
+; PPC32-NEXT: rlwimi 5, 6, 28, 0, 3
+; PPC32-NEXT: mulhwu 7, 5, 8
+; PPC32-NEXT: mulli 5, 5, 98
+; PPC32-NEXT: subc 4, 4, 5
+; PPC32-NEXT: li 5, 0
+; PPC32-NEXT: srwi 6, 6, 4
+; PPC32-NEXT: addze 5, 5
+; PPC32-NEXT: mulli 6, 6, 98
+; PPC32-NEXT: cntlzw 5, 5
+; PPC32-NEXT: rlwinm 5, 5, 27, 31, 31
+; PPC32-NEXT: add 6, 7, 6
+; PPC32-NEXT: sub 3, 3, 5
+; PPC32-NEXT: sub 3, 3, 6
+; PPC32-NEXT: blr
+;
+; PPC64-LABEL: fold_urem_i64:
+; PPC64: # %bb.0:
+; PPC64-NEXT: lis 5, 21399
+; PPC64-NEXT: lis 8, -17388
+; PPC64-NEXT: rotlwi 10, 4, 31
+; PPC64-NEXT: ori 5, 5, 33436
+; PPC64-NEXT: srwi 6, 3, 1
+; PPC64-NEXT: ori 8, 8, 58849
+; PPC64-NEXT: rlwimi 10, 3, 31, 0, 0
+; PPC64-NEXT: mulhwu 7, 6, 5
+; PPC64-NEXT: mulhwu 9, 6, 8
+; PPC64-NEXT: mulhwu 11, 10, 5
+; PPC64-NEXT: mullw 12, 6, 8
+; PPC64-NEXT: mullw 6, 6, 5
+; PPC64-NEXT: mullw 5, 10, 5
+; PPC64-NEXT: mulhwu 8, 10, 8
+; PPC64-NEXT: addc 5, 8, 5
+; PPC64-NEXT: addze 10, 11
+; PPC64-NEXT: addc 5, 5, 12
+; PPC64-NEXT: adde 5, 10, 9
+; PPC64-NEXT: addze 7, 7
+; PPC64-NEXT: addc 5, 5, 6
+; PPC64-NEXT: addze 6, 7
+; PPC64-NEXT: rotlwi 5, 5, 28
+; PPC64-NEXT: li 8, 98
+; PPC64-NEXT: rlwimi 5, 6, 28, 0, 3
+; PPC64-NEXT: mulhwu 7, 5, 8
+; PPC64-NEXT: srwi 6, 6, 4
+; PPC64-NEXT: mulli 5, 5, 98
+; PPC64-NEXT: subc 4, 4, 5
+; PPC64-NEXT: li 5, 0
+; PPC64-NEXT: addze 5, 5
+; PPC64-NEXT: cntlzw 5, 5
+; PPC64-NEXT: mulli 6, 6, 98
+; PPC64-NEXT: rlwinm 5, 5, 27, 31, 31
+; PPC64-NEXT: add 6, 7, 6
+; PPC64-NEXT: sub 3, 3, 5
+; PPC64-NEXT: sub 3, 3, 6
+; PPC64-NEXT: blr
%1 = urem i64 %x, 98
ret i64 %1
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; PPC32: {{.*}}
-; PPC64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 53c3f5841ba0f..6a04dcd7fb069 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -117,13 +117,48 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind {
define i64 @udiv64_constant_add(i64 %a) nounwind {
; RV32-LABEL: udiv64_constant_add:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 7
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __udivdi3
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: lui a2, 149797
+; RV32-NEXT: lui a3, 599186
+; RV32-NEXT: addi a2, a2, -1756
+; RV32-NEXT: addi a3, a3, 1171
+; RV32-NEXT: mul a4, a0, a2
+; RV32-NEXT: mulhu a5, a0, a3
+; RV32-NEXT: mul a6, a1, a3
+; RV32-NEXT: mulhu a7, a0, a2
+; RV32-NEXT: mulhu a3, a1, a3
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: add a6, a4, a6
+; RV32-NEXT: sltu a5, a4, a5
+; RV32-NEXT: sltu a4, a6, a4
+; RV32-NEXT: mul a6, a1, a2
+; RV32-NEXT: mulhu a2, a1, a2
+; RV32-NEXT: add a5, a7, a5
+; RV32-NEXT: add a3, a5, a3
+; RV32-NEXT: add a7, a3, a4
+; RV32-NEXT: sltu a3, a3, a5
+; RV32-NEXT: seqz a5, a7
+; RV32-NEXT: add a6, a7, a6
+; RV32-NEXT: and a4, a5, a4
+; RV32-NEXT: sltu a5, a6, a7
+; RV32-NEXT: sub a7, a0, a6
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: add a2, a5, a2
+; RV32-NEXT: sltu a0, a0, a7
+; RV32-NEXT: srli a4, a7, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: sub a1, a1, a2
+; RV32-NEXT: slli a0, a1, 31
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: or a0, a0, a4
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a6, a0, a6
+; RV32-NEXT: sltu a0, a6, a0
+; RV32-NEXT: srli a2, a6, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a1, 30
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a1, a1, 2
; RV32-NEXT: ret
;
; RV64-LABEL: udiv64_constant_add:
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index eb70d7f43c0ef..96250a9c88240 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -117,24 +117,94 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_7:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 7
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __udivdi3
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: lui a2, 149797
+; RV32-NEXT: lui a3, 599186
+; RV32-NEXT: addi a2, a2, -1756
+; RV32-NEXT: addi a3, a3, 1171
+; RV32-NEXT: mul a4, a0, a2
+; RV32-NEXT: mulhu a5, a0, a3
+; RV32-NEXT: mul a6, a1, a3
+; RV32-NEXT: mulhu a7, a0, a2
+; RV32-NEXT: mulhu a3, a1, a3
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: add a6, a4, a6
+; RV32-NEXT: sltu a5, a4, a5
+; RV32-NEXT: sltu a4, a6, a4
+; RV32-NEXT: mul a6, a1, a2
+; RV32-NEXT: mulhu a2, a1, a2
+; RV32-NEXT: add a5, a7, a5
+; RV32-NEXT: add a3, a5, a3
+; RV32-NEXT: add a7, a3, a4
+; RV32-NEXT: sltu a3, a3, a5
+; RV32-NEXT: seqz a5, a7
+; RV32-NEXT: add a6, a7, a6
+; RV32-NEXT: and a4, a5, a4
+; RV32-NEXT: sltu a5, a6, a7
+; RV32-NEXT: sub a7, a0, a6
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: add a2, a5, a2
+; RV32-NEXT: sltu a0, a0, a7
+; RV32-NEXT: srli a4, a7, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: sub a1, a1, a2
+; RV32-NEXT: slli a0, a1, 31
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: or a0, a0, a4
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a6, a0, a6
+; RV32-NEXT: sltu a0, a6, a0
+; RV32-NEXT: srli a2, a6, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a1, 30
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a1, a1, 2
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_7:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: li a2, 7
-; RV64-NEXT: li a3, 0
-; RV64-NEXT: call __udivti3
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: lui a2, %hi(.LCPI2_0)
+; RV64-NEXT: lui a3, %hi(.LCPI2_1)
+; RV64-NEXT: ld a2, %lo(.LCPI2_0)(a2)
+; RV64-NEXT: ld a3, %lo(.LCPI2_1)(a3)
+; RV64-NEXT: mul a4, a0, a2
+; RV64-NEXT: mulhu a5, a0, a3
+; RV64-NEXT: mul a6, a1, a3
+; RV64-NEXT: mulhu a7, a0, a2
+; RV64-NEXT: mulhu a3, a1, a3
+; RV64-NEXT: add a4, a5, a4
+; RV64-NEXT: add a6, a4, a6
+; RV64-NEXT: sltu a5, a4, a5
+; RV64-NEXT: sltu a4, a6, a4
+; RV64-NEXT: mul a6, a1, a2
+; RV64-NEXT: mulhu a2, a1, a2
+; RV64-NEXT: add a5, a7, a5
+; RV64-NEXT: add a3, a5, a3
+; RV64-NEXT: add a7, a3, a4
+; RV64-NEXT: sltu a3, a3, a5
+; RV64-NEXT: seqz a5, a7
+; RV64-NEXT: add a6, a7, a6
+; RV64-NEXT: and a4, a5, a4
+; RV64-NEXT: sltu a5, a6, a7
+; RV64-NEXT: sub a7, a0, a6
+; RV64-NEXT: or a3, a3, a4
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: sltu a0, a0, a7
+; RV64-NEXT: srli a4, a7, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: sub a1, a1, a0
+; RV64-NEXT: sub a1, a1, a2
+; RV64-NEXT: slli a0, a1, 63
+; RV64-NEXT: srli a1, a1, 1
+; RV64-NEXT: or a0, a0, a4
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: add a6, a0, a6
+; RV64-NEXT: sltu a0, a6, a0
+; RV64-NEXT: srli a2, a6, 2
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a1, 62
+; RV64-NEXT: or a0, a0, a2
+; RV64-NEXT: srli a1, a1, 2
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 7
ret iXLen2 %a
@@ -143,24 +213,70 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_9(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_9:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 9
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __udivdi3
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: lui a2, 932068
+; RV32-NEXT: lui a3, 582542
+; RV32-NEXT: addi a2, a2, -1821
+; RV32-NEXT: addi a3, a3, 911
+; RV32-NEXT: mul a4, a0, a2
+; RV32-NEXT: mulhu a5, a0, a3
+; RV32-NEXT: mul a6, a1, a3
+; RV32-NEXT: mulhu a0, a0, a2
+; RV32-NEXT: mulhu a3, a1, a3
+; RV32-NEXT: mul a7, a1, a2
+; RV32-NEXT: mulhu a1, a1, a2
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: add a6, a4, a6
+; RV32-NEXT: sltu a2, a4, a5
+; RV32-NEXT: sltu a4, a6, a4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a3, a0, a3
+; RV32-NEXT: add a2, a3, a4
+; RV32-NEXT: sltu a0, a3, a0
+; RV32-NEXT: seqz a3, a2
+; RV32-NEXT: add a7, a2, a7
+; RV32-NEXT: and a3, a3, a4
+; RV32-NEXT: sltu a2, a7, a2
+; RV32-NEXT: srli a4, a7, 3
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a1, 29
+; RV32-NEXT: or a0, a0, a4
+; RV32-NEXT: srli a1, a1, 3
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_9:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: li a2, 9
-; RV64-NEXT: li a3, 0
-; RV64-NEXT: call __udivti3
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: lui a2, %hi(.LCPI3_0)
+; RV64-NEXT: lui a3, %hi(.LCPI3_1)
+; RV64-NEXT: ld a2, %lo(.LCPI3_0)(a2)
+; RV64-NEXT: ld a3, %lo(.LCPI3_1)(a3)
+; RV64-NEXT: mul a4, a0, a2
+; RV64-NEXT: mulhu a5, a0, a3
+; RV64-NEXT: mul a6, a1, a3
+; RV64-NEXT: mulhu a0, a0, a2
+; RV64-NEXT: mulhu a3, a1, a3
+; RV64-NEXT: mul a7, a1, a2
+; RV64-NEXT: mulhu a1, a1, a2
+; RV64-NEXT: add a4, a5, a4
+; RV64-NEXT: add a6, a4, a6
+; RV64-NEXT: sltu a2, a4, a5
+; RV64-NEXT: sltu a4, a6, a4
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a3, a0, a3
+; RV64-NEXT: add a2, a3, a4
+; RV64-NEXT: sltu a0, a3, a0
+; RV64-NEXT: seqz a3, a2
+; RV64-NEXT: add a7, a2, a7
+; RV64-NEXT: and a3, a3, a4
+; RV64-NEXT: sltu a2, a7, a2
+; RV64-NEXT: srli a4, a7, 1
+; RV64-NEXT: or a0, a0, a3
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: add a1, a1, a0
+; RV64-NEXT: slli a0, a1, 63
+; RV64-NEXT: or a0, a0, a4
+; RV64-NEXT: srli a1, a1, 1
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 9
ret iXLen2 %a
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index bc4a99a00ac64..3f84e446166aa 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -79,24 +79,118 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_7:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 7
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __umoddi3
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: lui a2, 149797
+; RV32-NEXT: lui a3, 599186
+; RV32-NEXT: li a4, 7
+; RV32-NEXT: addi a2, a2, -1756
+; RV32-NEXT: addi a3, a3, 1171
+; RV32-NEXT: mul a5, a0, a2
+; RV32-NEXT: mulhu a6, a0, a3
+; RV32-NEXT: mul a7, a1, a3
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: add a7, a5, a7
+; RV32-NEXT: sltu a6, a5, a6
+; RV32-NEXT: sltu a5, a7, a5
+; RV32-NEXT: mulhu a7, a0, a2
+; RV32-NEXT: mulhu a3, a1, a3
+; RV32-NEXT: add a6, a7, a6
+; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: add a7, a3, a5
+; RV32-NEXT: sltu a3, a3, a6
+; RV32-NEXT: seqz a6, a7
+; RV32-NEXT: and a5, a6, a5
+; RV32-NEXT: mul a6, a1, a2
+; RV32-NEXT: mulhu a2, a1, a2
+; RV32-NEXT: add a6, a7, a6
+; RV32-NEXT: sltu a7, a6, a7
+; RV32-NEXT: or a3, a3, a5
+; RV32-NEXT: sub a5, a0, a6
+; RV32-NEXT: add a2, a7, a2
+; RV32-NEXT: sltu a7, a0, a5
+; RV32-NEXT: srli a5, a5, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: sub a3, a1, a7
+; RV32-NEXT: sub a3, a3, a2
+; RV32-NEXT: slli a7, a3, 31
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: or a5, a7, a5
+; RV32-NEXT: add a2, a3, a2
+; RV32-NEXT: add a6, a5, a6
+; RV32-NEXT: sltu a3, a6, a5
+; RV32-NEXT: srli a5, a6, 2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: slli a3, a5, 3
+; RV32-NEXT: srli a6, a2, 2
+; RV32-NEXT: slli a2, a2, 30
+; RV32-NEXT: sub a3, a0, a3
+; RV32-NEXT: slli a7, a6, 3
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: sub a5, a7, a6
+; RV32-NEXT: mulhu a4, a2, a4
+; RV32-NEXT: add a2, a3, a2
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: sltu a0, a0, a2
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: sub a1, a1, a4
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: test_urem_7:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: li a2, 7
-; RV64-NEXT: li a3, 0
-; RV64-NEXT: call __umodti3
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: lui a2, %hi(.LCPI2_0)
+; RV64-NEXT: lui a3, %hi(.LCPI2_1)
+; RV64-NEXT: ld a2, %lo(.LCPI2_0)(a2)
+; RV64-NEXT: ld a3, %lo(.LCPI2_1)(a3)
+; RV64-NEXT: li a4, 7
+; RV64-NEXT: mul a5, a0, a2
+; RV64-NEXT: mulhu a6, a0, a3
+; RV64-NEXT: mul a7, a1, a3
+; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: add a7, a5, a7
+; RV64-NEXT: sltu a6, a5, a6
+; RV64-NEXT: sltu a5, a7, a5
+; RV64-NEXT: mulhu a7, a0, a2
+; RV64-NEXT: mulhu a3, a1, a3
+; RV64-NEXT: add a6, a7, a6
+; RV64-NEXT: add a3, a6, a3
+; RV64-NEXT: add a7, a3, a5
+; RV64-NEXT: sltu a3, a3, a6
+; RV64-NEXT: seqz a6, a7
+; RV64-NEXT: and a5, a6, a5
+; RV64-NEXT: mul a6, a1, a2
+; RV64-NEXT: mulhu a2, a1, a2
+; RV64-NEXT: add a6, a7, a6
+; RV64-NEXT: sltu a7, a6, a7
+; RV64-NEXT: or a3, a3, a5
+; RV64-NEXT: sub a5, a0, a6
+; RV64-NEXT: add a2, a7, a2
+; RV64-NEXT: sltu a7, a0, a5
+; RV64-NEXT: srli a5, a5, 1
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: sub a3, a1, a7
+; RV64-NEXT: sub a3, a3, a2
+; RV64-NEXT: slli a7, a3, 63
+; RV64-NEXT: srli a3, a3, 1
+; RV64-NEXT: or a5, a7, a5
+; RV64-NEXT: add a2, a3, a2
+; RV64-NEXT: add a6, a5, a6
+; RV64-NEXT: sltu a3, a6, a5
+; RV64-NEXT: srli a5, a6, 2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: slli a3, a5, 3
+; RV64-NEXT: srli a6, a2, 2
+; RV64-NEXT: slli a2, a2, 62
+; RV64-NEXT: sub a3, a0, a3
+; RV64-NEXT: slli a7, a6, 3
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: sub a5, a7, a6
+; RV64-NEXT: mulhu a4, a2, a4
+; RV64-NEXT: add a2, a3, a2
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: sltu a0, a0, a2
+; RV64-NEXT: sub a1, a1, a0
+; RV64-NEXT: sub a1, a1, a4
+; RV64-NEXT: mv a0, a2
; RV64-NEXT: ret
%a = urem iXLen2 %x, 7
ret iXLen2 %a
@@ -105,24 +199,94 @@ define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
define iXLen2 @test_urem_9(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_9:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 9
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __umoddi3
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: lui a2, 932068
+; RV32-NEXT: lui a3, 582542
+; RV32-NEXT: li a4, 9
+; RV32-NEXT: addi a2, a2, -1821
+; RV32-NEXT: addi a3, a3, 911
+; RV32-NEXT: mul a5, a0, a2
+; RV32-NEXT: mulhu a6, a0, a3
+; RV32-NEXT: mul a7, a1, a3
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: add a7, a5, a7
+; RV32-NEXT: sltu a6, a5, a6
+; RV32-NEXT: sltu a5, a7, a5
+; RV32-NEXT: mulhu a7, a0, a2
+; RV32-NEXT: mulhu a3, a1, a3
+; RV32-NEXT: add a6, a7, a6
+; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: add a7, a3, a5
+; RV32-NEXT: sltu a3, a3, a6
+; RV32-NEXT: seqz a6, a7
+; RV32-NEXT: and a5, a6, a5
+; RV32-NEXT: mul a6, a1, a2
+; RV32-NEXT: mulhu a2, a1, a2
+; RV32-NEXT: add a6, a7, a6
+; RV32-NEXT: sltu a7, a6, a7
+; RV32-NEXT: or a3, a3, a5
+; RV32-NEXT: srli a5, a6, 3
+; RV32-NEXT: andi a6, a6, -8
+; RV32-NEXT: add a2, a7, a2
+; RV32-NEXT: sub a6, a0, a6
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: srli a3, a2, 3
+; RV32-NEXT: andi a7, a2, -8
+; RV32-NEXT: slli a2, a2, 29
+; RV32-NEXT: add a3, a7, a3
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: mulhu a4, a2, a4
+; RV32-NEXT: sub a2, a6, a2
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: sltu a0, a0, a2
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: sub a1, a1, a3
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: test_urem_9:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: li a2, 9
-; RV64-NEXT: li a3, 0
-; RV64-NEXT: call __umodti3
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: lui a2, %hi(.LCPI3_0)
+; RV64-NEXT: lui a3, %hi(.LCPI3_1)
+; RV64-NEXT: ld a2, %lo(.LCPI3_0)(a2)
+; RV64-NEXT: ld a3, %lo(.LCPI3_1)(a3)
+; RV64-NEXT: li a4, 9
+; RV64-NEXT: mul a5, a0, a2
+; RV64-NEXT: mulhu a6, a0, a3
+; RV64-NEXT: mul a7, a1, a3
+; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: add a7, a5, a7
+; RV64-NEXT: sltu a6, a5, a6
+; RV64-NEXT: sltu a5, a7, a5
+; RV64-NEXT: mulhu a7, a0, a2
+; RV64-NEXT: mulhu a3, a1, a3
+; RV64-NEXT: add a6, a7, a6
+; RV64-NEXT: add a3, a6, a3
+; RV64-NEXT: add a7, a3, a5
+; RV64-NEXT: sltu a3, a3, a6
+; RV64-NEXT: seqz a6, a7
+; RV64-NEXT: and a5, a6, a5
+; RV64-NEXT: mul a6, a1, a2
+; RV64-NEXT: mulhu a2, a1, a2
+; RV64-NEXT: add a6, a7, a6
+; RV64-NEXT: sltu a7, a6, a7
+; RV64-NEXT: srli a6, a6, 1
+; RV64-NEXT: or a3, a3, a5
+; RV64-NEXT: add a2, a7, a2
+; RV64-NEXT: slli a5, a6, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: sub a3, a0, a5
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: slli a2, a2, 63
+; RV64-NEXT: slli a7, a5, 3
+; RV64-NEXT: or a2, a2, a6
+; RV64-NEXT: add a5, a7, a5
+; RV64-NEXT: mulhu a4, a2, a4
+; RV64-NEXT: sub a2, a3, a2
+; RV64-NEXT: add a4, a4, a5
+; RV64-NEXT: sltu a0, a0, a2
+; RV64-NEXT: sub a1, a1, a0
+; RV64-NEXT: sub a1, a1, a4
+; RV64-NEXT: mv a0, a2
; RV64-NEXT: ret
%a = urem iXLen2 %x, 9
ret iXLen2 %a
diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll
index 017b2d36bdd58..ad6beaf47ed3e 100644
--- a/llvm/test/CodeGen/RISCV/urem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll
@@ -232,13 +232,50 @@ define i64 @fold_urem_i64(i64 %x) nounwind {
;
; RV32IM-LABEL: fold_urem_i64:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: addi sp, sp, -16
-; RV32IM-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: li a2, 98
-; RV32IM-NEXT: li a3, 0
-; RV32IM-NEXT: call __umoddi3
-; RV32IM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32IM-NEXT: addi sp, sp, 16
+; RV32IM-NEXT: srli a2, a0, 1
+; RV32IM-NEXT: slli a3, a1, 31
+; RV32IM-NEXT: lui a4, 342392
+; RV32IM-NEXT: lui a5, 770382
+; RV32IM-NEXT: srli a6, a1, 1
+; RV32IM-NEXT: or a2, a3, a2
+; RV32IM-NEXT: addi a3, a4, 668
+; RV32IM-NEXT: addi a4, a5, 1505
+; RV32IM-NEXT: mul a5, a2, a3
+; RV32IM-NEXT: mulhu a7, a2, a4
+; RV32IM-NEXT: mul t0, a6, a4
+; RV32IM-NEXT: mulhu a2, a2, a3
+; RV32IM-NEXT: mulhu a4, a6, a4
+; RV32IM-NEXT: mul t1, a6, a3
+; RV32IM-NEXT: mulhu a3, a6, a3
+; RV32IM-NEXT: add a5, a7, a5
+; RV32IM-NEXT: add t0, a5, t0
+; RV32IM-NEXT: sltu a6, a5, a7
+; RV32IM-NEXT: sltu a5, t0, a5
+; RV32IM-NEXT: li a7, 98
+; RV32IM-NEXT: add a2, a2, a6
+; RV32IM-NEXT: add a4, a2, a4
+; RV32IM-NEXT: add a6, a4, a5
+; RV32IM-NEXT: sltu a2, a4, a2
+; RV32IM-NEXT: seqz a4, a6
+; RV32IM-NEXT: add t1, a6, t1
+; RV32IM-NEXT: and a4, a4, a5
+; RV32IM-NEXT: sltu a5, t1, a6
+; RV32IM-NEXT: srli a6, t1, 4
+; RV32IM-NEXT: or a2, a2, a4
+; RV32IM-NEXT: add a3, a5, a3
+; RV32IM-NEXT: add a2, a3, a2
+; RV32IM-NEXT: srli a3, a2, 4
+; RV32IM-NEXT: slli a2, a2, 28
+; RV32IM-NEXT: mul a3, a3, a7
+; RV32IM-NEXT: or a2, a2, a6
+; RV32IM-NEXT: mulhu a4, a2, a7
+; RV32IM-NEXT: mul a2, a2, a7
+; RV32IM-NEXT: add a3, a4, a3
+; RV32IM-NEXT: sub a2, a0, a2
+; RV32IM-NEXT: sltu a0, a0, a2
+; RV32IM-NEXT: sub a1, a1, a3
+; RV32IM-NEXT: sub a1, a1, a0
+; RV32IM-NEXT: mv a0, a2
; RV32IM-NEXT: ret
;
; RV64I-LABEL: fold_urem_i64:
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index ec97e7a0ae558..e23ba1628bc9c 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -862,50 +862,162 @@ define <4 x i64> @fold_urem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill
; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill
; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: lw s1, 16(a1)
-; RV32IM-NEXT: lw s2, 20(a1)
-; RV32IM-NEXT: lw s3, 24(a1)
-; RV32IM-NEXT: lw s4, 28(a1)
-; RV32IM-NEXT: lw a3, 0(a1)
-; RV32IM-NEXT: lw a4, 4(a1)
-; RV32IM-NEXT: lw s5, 8(a1)
-; RV32IM-NEXT: lw s6, 12(a1)
+; RV32IM-NEXT: sw s9, 4(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mv a2, a1
; RV32IM-NEXT: mv s0, a0
+; RV32IM-NEXT: lw a4, 16(a1)
+; RV32IM-NEXT: lw a3, 20(a1)
+; RV32IM-NEXT: lw a6, 24(a1)
+; RV32IM-NEXT: lw a7, 28(a1)
+; RV32IM-NEXT: lw a0, 0(a1)
+; RV32IM-NEXT: lw a1, 4(a1)
+; RV32IM-NEXT: lw a5, 8(a2)
+; RV32IM-NEXT: lw a2, 12(a2)
+; RV32IM-NEXT: lui t0, 410312
+; RV32IM-NEXT: lui t1, 729444
+; RV32IM-NEXT: lui t2, 410452
+; RV32IM-NEXT: lui t4, 25653
+; RV32IM-NEXT: lui t5, 791991
+; RV32IM-NEXT: lui t6, 834723
+; RV32IM-NEXT: addi t3, t0, 1424
+; RV32IM-NEXT: addi s1, t1, 713
+; RV32IM-NEXT: addi s2, t2, -952
+; RV32IM-NEXT: addi t4, t4, 965
+; RV32IM-NEXT: addi t2, t5, 77
+; RV32IM-NEXT: addi t5, t6, -179
+; RV32IM-NEXT: mul t6, a4, t3
+; RV32IM-NEXT: mulhu s3, a4, s1
+; RV32IM-NEXT: mul s4, a3, s1
+; RV32IM-NEXT: mulhu s5, a4, t3
+; RV32IM-NEXT: srli t0, a5, 1
+; RV32IM-NEXT: slli t1, a2, 31
+; RV32IM-NEXT: srli s6, a2, 1
+; RV32IM-NEXT: or s7, t1, t0
+; RV32IM-NEXT: mul s8, s6, t4
+; RV32IM-NEXT: mulhu s9, s6, t4
+; RV32IM-NEXT: mul t1, s6, s2
+; RV32IM-NEXT: mulhu t0, s6, s2
+; RV32IM-NEXT: mul s6, s7, s2
+; RV32IM-NEXT: mulhu t4, s7, t4
+; RV32IM-NEXT: mulhu s2, s7, s2
+; RV32IM-NEXT: mul s7, a6, t2
+; RV32IM-NEXT: add t6, s3, t6
+; RV32IM-NEXT: add s4, t6, s4
+; RV32IM-NEXT: sltu s3, t6, s3
+; RV32IM-NEXT: sltu t6, s4, t6
+; RV32IM-NEXT: mulhu s4, a6, t5
+; RV32IM-NEXT: add s3, s5, s3
+; RV32IM-NEXT: mul s5, a7, t5
+; RV32IM-NEXT: add s7, s4, s7
+; RV32IM-NEXT: add s5, s7, s5
+; RV32IM-NEXT: sltu s4, s7, s4
+; RV32IM-NEXT: sltu s5, s5, s7
+; RV32IM-NEXT: mulhu s7, a6, t2
+; RV32IM-NEXT: add s4, s7, s4
+; RV32IM-NEXT: add s6, t4, s6
+; RV32IM-NEXT: add s8, s6, s8
+; RV32IM-NEXT: sltu t4, s6, t4
+; RV32IM-NEXT: sltu s6, s8, s6
+; RV32IM-NEXT: mulhu s1, a3, s1
+; RV32IM-NEXT: mulhu t5, a7, t5
+; RV32IM-NEXT: add s1, s3, s1
+; RV32IM-NEXT: add t5, s4, t5
+; RV32IM-NEXT: sltu s3, s1, s3
+; RV32IM-NEXT: add s1, s1, t6
+; RV32IM-NEXT: add t4, s2, t4
+; RV32IM-NEXT: add s2, t5, s5
+; RV32IM-NEXT: sltu t5, t5, s4
+; RV32IM-NEXT: seqz s4, s1
+; RV32IM-NEXT: and t6, s4, t6
+; RV32IM-NEXT: seqz s4, s2
+; RV32IM-NEXT: and s4, s4, s5
+; RV32IM-NEXT: or t6, s3, t6
+; RV32IM-NEXT: mul s3, a3, t3
+; RV32IM-NEXT: mulhu s5, a3, t3
+; RV32IM-NEXT: add t3, s1, s3
+; RV32IM-NEXT: sltu s1, t3, s1
+; RV32IM-NEXT: add s1, s1, s5
+; RV32IM-NEXT: or t5, t5, s4
+; RV32IM-NEXT: mul s3, a7, t2
+; RV32IM-NEXT: mulhu t2, a7, t2
+; RV32IM-NEXT: add s3, s2, s3
+; RV32IM-NEXT: sltu s2, s3, s2
+; RV32IM-NEXT: add t2, s2, t2
+; RV32IM-NEXT: add s9, t4, s9
+; RV32IM-NEXT: sltu t4, s9, t4
+; RV32IM-NEXT: add s9, s9, s6
+; RV32IM-NEXT: add t6, s1, t6
+; RV32IM-NEXT: seqz s1, s9
+; RV32IM-NEXT: and s1, s1, s6
+; RV32IM-NEXT: add t2, t2, t5
+; RV32IM-NEXT: or t4, t4, s1
+; RV32IM-NEXT: sub t5, a4, t3
+; RV32IM-NEXT: srli s1, s3, 12
+; RV32IM-NEXT: add t1, s9, t1
+; RV32IM-NEXT: sltu s2, t1, s9
+; RV32IM-NEXT: add t0, s2, t0
+; RV32IM-NEXT: sltu s2, a4, t5
+; RV32IM-NEXT: srli t5, t5, 1
+; RV32IM-NEXT: sub s2, a3, s2
+; RV32IM-NEXT: sub s2, s2, t6
+; RV32IM-NEXT: add t0, t0, t4
+; RV32IM-NEXT: slli t4, t2, 20
+; RV32IM-NEXT: or t4, t4, s1
+; RV32IM-NEXT: slli s1, s2, 31
+; RV32IM-NEXT: or t5, s1, t5
+; RV32IM-NEXT: lui s1, 1
+; RV32IM-NEXT: addi s1, s1, 1327
+; RV32IM-NEXT: srli t1, t1, 7
+; RV32IM-NEXT: srli t2, t2, 12
+; RV32IM-NEXT: srli s2, s2, 1
+; RV32IM-NEXT: mul t2, t2, s1
+; RV32IM-NEXT: add t6, s2, t6
+; RV32IM-NEXT: mulhu s2, t4, s1
+; RV32IM-NEXT: mul t4, t4, s1
+; RV32IM-NEXT: slli s1, t0, 25
+; RV32IM-NEXT: or t1, s1, t1
+; RV32IM-NEXT: li s1, 654
+; RV32IM-NEXT: srli t0, t0, 7
+; RV32IM-NEXT: mul t0, t0, s1
+; RV32IM-NEXT: add t2, s2, t2
+; RV32IM-NEXT: mulhu s2, t1, s1
+; RV32IM-NEXT: mul t1, t1, s1
+; RV32IM-NEXT: sub a7, a7, t2
+; RV32IM-NEXT: add t3, t5, t3
+; RV32IM-NEXT: sltu t2, t3, t5
+; RV32IM-NEXT: add t2, t6, t2
+; RV32IM-NEXT: li t5, 23
+; RV32IM-NEXT: sub s1, a6, t4
+; RV32IM-NEXT: srli t3, t3, 4
+; RV32IM-NEXT: sltu a6, a6, s1
+; RV32IM-NEXT: add t0, s2, t0
+; RV32IM-NEXT: sub s2, a5, t1
+; RV32IM-NEXT: srli t1, t2, 4
+; RV32IM-NEXT: slli t2, t2, 28
+; RV32IM-NEXT: sltu a5, a5, s2
+; RV32IM-NEXT: sub a2, a2, t0
+; RV32IM-NEXT: mul t0, t1, t5
+; RV32IM-NEXT: or t1, t2, t3
+; RV32IM-NEXT: sub s3, a2, a5
+; RV32IM-NEXT: mulhu a2, t1, t5
+; RV32IM-NEXT: mul a5, t1, t5
+; RV32IM-NEXT: add a2, a2, t0
+; RV32IM-NEXT: sub s4, a4, a5
+; RV32IM-NEXT: sltu a4, a4, s4
+; RV32IM-NEXT: sub a3, a3, a2
+; RV32IM-NEXT: sub s5, a3, a4
+; RV32IM-NEXT: sub s6, a7, a6
; RV32IM-NEXT: li a2, 1
-; RV32IM-NEXT: mv a0, a3
-; RV32IM-NEXT: mv a1, a4
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
-; RV32IM-NEXT: mv s7, a0
-; RV32IM-NEXT: mv s8, a1
-; RV32IM-NEXT: li a2, 654
-; RV32IM-NEXT: mv a0, s5
-; RV32IM-NEXT: mv a1, s6
-; RV32IM-NEXT: li a3, 0
-; RV32IM-NEXT: call __umoddi3
-; RV32IM-NEXT: mv s5, a0
-; RV32IM-NEXT: mv s6, a1
-; RV32IM-NEXT: li a2, 23
-; RV32IM-NEXT: mv a0, s1
-; RV32IM-NEXT: mv a1, s2
-; RV32IM-NEXT: li a3, 0
-; RV32IM-NEXT: call __umoddi3
-; RV32IM-NEXT: mv s1, a0
-; RV32IM-NEXT: mv s2, a1
-; RV32IM-NEXT: lui a0, 1
-; RV32IM-NEXT: addi a2, a0, 1327
-; RV32IM-NEXT: mv a0, s3
-; RV32IM-NEXT: mv a1, s4
-; RV32IM-NEXT: li a3, 0
-; RV32IM-NEXT: call __umoddi3
-; RV32IM-NEXT: sw s1, 16(s0)
-; RV32IM-NEXT: sw s2, 20(s0)
-; RV32IM-NEXT: sw a0, 24(s0)
-; RV32IM-NEXT: sw a1, 28(s0)
-; RV32IM-NEXT: sw s7, 0(s0)
-; RV32IM-NEXT: sw s8, 4(s0)
-; RV32IM-NEXT: sw s5, 8(s0)
-; RV32IM-NEXT: sw s6, 12(s0)
+; RV32IM-NEXT: sw s4, 16(s0)
+; RV32IM-NEXT: sw s5, 20(s0)
+; RV32IM-NEXT: sw s1, 24(s0)
+; RV32IM-NEXT: sw s6, 28(s0)
+; RV32IM-NEXT: sw a0, 0(s0)
+; RV32IM-NEXT: sw a1, 4(s0)
+; RV32IM-NEXT: sw s2, 8(s0)
+; RV32IM-NEXT: sw s3, 12(s0)
; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
@@ -916,6 +1028,7 @@ define <4 x i64> @fold_urem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: lw s6, 16(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s7, 12(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s8, 8(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s9, 4(sp) # 4-byte Folded Reload
; RV32IM-NEXT: addi sp, sp, 48
; RV32IM-NEXT: ret
;
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 14bcc22880697..d73c33ea506b0 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -294,19 +294,77 @@ entry:
define i64 @PR23590(i64 %x) nounwind {
; X86-LABEL: PR23590:
; X86: # %bb.0: # %entry
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $12345 # imm = 0x3039
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $7
-; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
; X86-NEXT: pushl %eax
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1425045447, %edx # imm = 0x54F077C7
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl $417841695, %edx # imm = 0x18E7C21F
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl $1425045447, %edx # imm = 0x54F077C7
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl $417841695, %edx # imm = 0x18E7C21F
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: shrdl $12, %ebx, %edi
+; X86-NEXT: movl $12345, %edx # imm = 0x3039
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: shrl $12, %ebx
+; X86-NEXT: imull $12345, %ebx, %edi # imm = 0x3039
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: subl %eax, %esi
+; X86-NEXT: sbbl $0, %ecx
+; X86-NEXT: subl %edi, %ecx
+; X86-NEXT: movl $613566756, %ebx # imm = 0x24924924
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl $-1840700269, %edx # imm = 0x92492493
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl $-1840700269, %ecx # imm = 0x92492493
+; X86-NEXT: mull %ecx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-FAST-LABEL: PR23590:
@@ -347,27 +405,43 @@ define { i64, i32 } @PR38622(i64) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $-294967296 # imm = 0xEE6B2800
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $-294967296 # imm = 0xEE6B2800
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: shrdl $11, %edi, %ebx
+; X86-NEXT: movl $1125899, %edx # imm = 0x112E0B
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl $-400107883, %edx # imm = 0xE826D695
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: shrl $11, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl $1125899, %edx # imm = 0x112E0B
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl $-400107883, %edx # imm = 0xE826D695
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %edx, %esi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: shrdl $9, %ebx, %esi
+; X86-NEXT: imull $-294967296, %esi, %eax # imm = 0xEE6B2800
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: shrl $9, %ebx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -1165,13 +1239,41 @@ entry:
define i64 @udiv_i64_magic_large_postshift(i64 %x) nounwind {
; X86-LABEL: udiv_i64_magic_large_postshift:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl $-1431655766, %ecx # imm = 0xAAAAAAAA
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl $-1431655765, %esi # imm = 0xAAAAAAAB
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: shrl $31, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_magic_large_postshift:
@@ -1190,13 +1292,44 @@ define i64 @udiv_i64_magic_large_postshift(i64 %x) nounwind {
define i64 @urem_i64_magic_large_postshift(i64 %x) nounwind {
; X86-LABEL: urem_i64_magic_large_postshift:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl $-1431655766, %ebx # imm = 0xAAAAAAAA
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: shrl %ebx
+; X86-NEXT: andl $1073741824, %ebx # imm = 0x40000000
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: urem_i64_magic_large_postshift:
@@ -1217,13 +1350,24 @@ define i64 @urem_i64_magic_large_postshift(i64 %x) nounwind {
define i64 @udiv_i64_magic_large_preshift(i64 %x) nounwind {
; X86-LABEL: udiv_i64_magic_large_preshift:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $14
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shrl %ecx
+; X86-NEXT: movl $613566756, %edx # imm = 0x24924924
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl $-1840700269, %edx # imm = 0x92492493
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_magic_large_preshift:
@@ -1242,13 +1386,37 @@ define i64 @udiv_i64_magic_large_preshift(i64 %x) nounwind {
define i64 @urem_i64_magic_large_preshift(i64 %x) nounwind {
; X86-LABEL: urem_i64_magic_large_preshift:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $14
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %ebx
+; X86-NEXT: shrl %ebx
+; X86-NEXT: movl $613566756, %edx # imm = 0x24924924
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl $-1840700269, %edx # imm = 0x92492493
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: shll $4, %eax
+; X86-NEXT: subl %eax, %esi
+; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: urem_i64_magic_large_preshift:
@@ -1270,13 +1438,56 @@ define i64 @urem_i64_magic_large_preshift(i64 %x) nounwind {
define i64 @udiv_i64_magic_is_add(i64 %x) nounwind {
; X86-LABEL: udiv_i64_magic_is_add:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $196608 # imm = 0x30000
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1431626638, %edx # imm = 0x5554E38E
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl $1431626638, %edx # imm = 0x5554E38E
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: adcl %ebp, %edx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: leal (%edx,%ebx), %eax
+; X86-NEXT: subl %eax, %esi
+; X86-NEXT: sbbl $0, %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: shrdl $1, %ecx, %esi
+; X86-NEXT: shrl %ecx
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: shrl $17, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_magic_is_add:
@@ -1297,13 +1508,64 @@ define i64 @udiv_i64_magic_is_add(i64 %x) nounwind {
define i64 @urem_i64_magic_is_add(i64 %x) nounwind {
; X86-LABEL: urem_i64_magic_is_add:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $196608 # imm = 0x30000
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl $1431626638, %ebx # imm = 0x5554E38E
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl %ebp, %edx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: leal (%edx,%ebx), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: sbbl $0, %ebp
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: sbbl %edi, %ebp
+; X86-NEXT: shrdl $1, %ebp, %ecx
+; X86-NEXT: shrl %ebp
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: shrl $17, %ebp
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: shll $16, %ebp
+; X86-NEXT: leal (%ebp,%ebp,2), %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: sbbl $0, %esi
+; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: urem_i64_magic_is_add:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 9d54452404fb0..d05dc7c62f2e1 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -67,25 +67,93 @@ define i64 @div128(i128 %x) nounwind {
define i64 @umod128(i128 %x) nounwind {
; X86-64-LABEL: umod128:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movl $11, %edx
-; X86-64-NEXT: xorl %ecx, %ecx
-; X86-64-NEXT: callq __umodti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movabsq $8384883669867978007, %r10 # imm = 0x745D1745D1745D17
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: movq %rax, %rcx
+; X86-64-NEXT: movq %rdx, %r8
+; X86-64-NEXT: movabsq $5030930201920786805, %r11 # imm = 0x45D1745D1745D175
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: movq %rdx, %r9
+; X86-64-NEXT: addq %rcx, %r9
+; X86-64-NEXT: adcq $0, %r8
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movq %rax, %r10
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: addq %r9, %rax
+; X86-64-NEXT: adcq %r8, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: addq %r10, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: subq %rdx, %rax
+; X86-64-NEXT: sbbq $0, %rsi
+; X86-64-NEXT: subq %rcx, %rsi
+; X86-64-NEXT: movl %esi, %r8d
+; X86-64-NEXT: shrl %r8d
+; X86-64-NEXT: shldq $63, %rax, %rsi
+; X86-64-NEXT: xorl %eax, %eax
+; X86-64-NEXT: addq %rdx, %rsi
+; X86-64-NEXT: setb %al
+; X86-64-NEXT: addl %r8d, %ecx
+; X86-64-NEXT: addl %eax, %ecx
+; X86-64-NEXT: shldq $61, %rsi, %rcx
+; X86-64-NEXT: leaq (%rcx,%rcx,4), %rax
+; X86-64-NEXT: leaq (%rcx,%rax,2), %rax
+; X86-64-NEXT: subq %rax, %rdi
+; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: retq
;
; WIN64-LABEL: umod128:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __umodti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movabsq $8384883669867978007, %rsi # imm = 0x745D1745D1745D17
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: movq %rax, %r9
+; WIN64-NEXT: movq %rdx, %r10
+; WIN64-NEXT: movabsq $5030930201920786805, %rdi # imm = 0x45D1745D1745D175
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: movq %rdx, %r11
+; WIN64-NEXT: addq %r9, %r11
+; WIN64-NEXT: adcq $0, %r10
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movq %rax, %rsi
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: addq %r11, %rax
+; WIN64-NEXT: adcq %r10, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: addq %rsi, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: subq %rdx, %rax
+; WIN64-NEXT: sbbq $0, %r8
+; WIN64-NEXT: subq %r9, %r8
+; WIN64-NEXT: movl %r8d, %r10d
+; WIN64-NEXT: shrl %r10d
+; WIN64-NEXT: shldq $63, %rax, %r8
+; WIN64-NEXT: xorl %eax, %eax
+; WIN64-NEXT: addq %rdx, %r8
+; WIN64-NEXT: setb %al
+; WIN64-NEXT: addl %r10d, %r9d
+; WIN64-NEXT: addl %eax, %r9d
+; WIN64-NEXT: shldq $61, %r8, %r9
+; WIN64-NEXT: leaq (%r9,%r9,4), %rax
+; WIN64-NEXT: leaq (%r9,%rax,2), %rax
+; WIN64-NEXT: subq %rax, %rcx
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
; WIN64-NEXT: retq
@@ -1018,27 +1086,70 @@ entry:
define i128 @udiv_magic_preshift_and_postshift(i128 %x) nounwind {
; X86-64-LABEL: udiv_magic_preshift_and_postshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movl $22, %edx
-; X86-64-NEXT: xorl %ecx, %ecx
-; X86-64-NEXT: callq __udivti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: shrdq $1, %rsi, %rdi
+; X86-64-NEXT: movabsq $-5030930201920786805, %r9 # imm = 0xBA2E8BA2E8BA2E8B
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: movq %rax, %r8
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movabsq $-6707906935894382405, %r10 # imm = 0xA2E8BA2E8BA2E8BB
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: movq %rdx, %rdi
+; X86-64-NEXT: addq %r8, %rdi
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: shrq %rsi
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: movq %rdx, %r8
+; X86-64-NEXT: movq %rax, %r9
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: addq %rdi, %rax
+; X86-64-NEXT: adcq %rdx, %rcx
+; X86-64-NEXT: adcq $0, %r8
+; X86-64-NEXT: addq %r9, %rcx
+; X86-64-NEXT: adcq $0, %r8
+; X86-64-NEXT: shrdq $3, %r8, %rcx
+; X86-64-NEXT: shrq $3, %r8
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: movq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_magic_preshift_and_postshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $22, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __udivti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq %rcx, %r9
+; WIN64-NEXT: shrdq $1, %rdx, %r9
+; WIN64-NEXT: movabsq $-5030930201920786805, %r11 # imm = 0xBA2E8BA2E8BA2E8B
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %rdx, %rcx
+; WIN64-NEXT: movabsq $-6707906935894382405, %rsi # imm = 0xA2E8BA2E8BA2E8BB
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: addq %r10, %r9
+; WIN64-NEXT: adcq $0, %rcx
+; WIN64-NEXT: shrq %r8
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: movq %rdx, %r10
+; WIN64-NEXT: movq %rax, %r11
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: addq %r9, %rax
+; WIN64-NEXT: adcq %rdx, %rcx
+; WIN64-NEXT: adcq $0, %r10
+; WIN64-NEXT: addq %r11, %rcx
+; WIN64-NEXT: adcq $0, %r10
+; WIN64-NEXT: shrdq $3, %r10, %rcx
+; WIN64-NEXT: shrq $3, %r10
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r10, %rdx
+; WIN64-NEXT: popq %rsi
; WIN64-NEXT: retq
%ret = udiv i128 %x, 22
ret i128 %ret
@@ -1048,27 +1159,99 @@ define i128 @udiv_magic_preshift_and_postshift(i128 %x) nounwind {
define i128 @urem_magic_preshift_and_postshift(i128 %x) nounwind {
; X86-64-LABEL: urem_magic_preshift_and_postshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: pushq %rbx
+; X86-64-NEXT: movq %rdi, %r8
+; X86-64-NEXT: shrdq $1, %rsi, %r8
+; X86-64-NEXT: movabsq $-5030930201920786805, %r11 # imm = 0xBA2E8BA2E8BA2E8B
+; X86-64-NEXT: movq %r8, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: movq %rax, %r9
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movabsq $-6707906935894382405, %rbx # imm = 0xA2E8BA2E8BA2E8BB
+; X86-64-NEXT: movq %r8, %rax
+; X86-64-NEXT: mulq %rbx
+; X86-64-NEXT: movq %rdx, %r10
+; X86-64-NEXT: addq %r9, %r10
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: movq %rsi, %r9
+; X86-64-NEXT: shrq %r9
+; X86-64-NEXT: movq %r9, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: movq %rdx, %r8
+; X86-64-NEXT: movq %rax, %r11
+; X86-64-NEXT: movq %r9, %rax
+; X86-64-NEXT: mulq %rbx
+; X86-64-NEXT: addq %r10, %rax
+; X86-64-NEXT: adcq %rdx, %rcx
+; X86-64-NEXT: adcq $0, %r8
+; X86-64-NEXT: addq %r11, %rcx
+; X86-64-NEXT: adcq $0, %r8
+; X86-64-NEXT: shrdq $3, %r8, %rcx
+; X86-64-NEXT: shrq $3, %r8
+; X86-64-NEXT: leaq (%r8,%r8,4), %rax
+; X86-64-NEXT: leaq (%r8,%rax,4), %r9
; X86-64-NEXT: movl $22, %edx
-; X86-64-NEXT: xorl %ecx, %ecx
-; X86-64-NEXT: callq __umodti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: mulq %rdx
+; X86-64-NEXT: addq %r8, %rdx
+; X86-64-NEXT: addq %r9, %rdx
+; X86-64-NEXT: subq %rax, %rdi
+; X86-64-NEXT: sbbq $0, %rsi
+; X86-64-NEXT: subq %rdx, %rsi
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: movq %rsi, %rdx
+; X86-64-NEXT: popq %rbx
; X86-64-NEXT: retq
;
; WIN64-LABEL: urem_magic_preshift_and_postshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $22, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __umodti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: pushq %rbx
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq %rcx, %r10
+; WIN64-NEXT: shrdq $1, %rdx, %r10
+; WIN64-NEXT: movabsq $-5030930201920786805, %rdi # imm = 0xBA2E8BA2E8BA2E8B
+; WIN64-NEXT: movq %r10, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: movq %rax, %r11
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movabsq $-6707906935894382405, %rbx # imm = 0xA2E8BA2E8BA2E8BB
+; WIN64-NEXT: movq %r10, %rax
+; WIN64-NEXT: mulq %rbx
+; WIN64-NEXT: movq %rdx, %rsi
+; WIN64-NEXT: addq %r11, %rsi
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: movq %r8, %r11
+; WIN64-NEXT: shrq %r11
+; WIN64-NEXT: movq %r11, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: movq %rdx, %r10
+; WIN64-NEXT: movq %rax, %rdi
+; WIN64-NEXT: movq %r11, %rax
+; WIN64-NEXT: mulq %rbx
+; WIN64-NEXT: addq %rsi, %rax
+; WIN64-NEXT: adcq %rdx, %r9
+; WIN64-NEXT: adcq $0, %r10
+; WIN64-NEXT: addq %rdi, %r9
+; WIN64-NEXT: adcq $0, %r10
+; WIN64-NEXT: shrdq $3, %r10, %r9
+; WIN64-NEXT: shrq $3, %r10
+; WIN64-NEXT: leaq (%r10,%r10,4), %rax
+; WIN64-NEXT: leaq (%r10,%rax,4), %r11
+; WIN64-NEXT: movl $22, %edx
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: addq %r10, %rdx
+; WIN64-NEXT: addq %r11, %rdx
+; WIN64-NEXT: subq %rax, %rcx
+; WIN64-NEXT: sbbq $0, %r8
+; WIN64-NEXT: subq %rdx, %r8
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r8, %rdx
+; WIN64-NEXT: popq %rbx
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
; WIN64-NEXT: retq
%ret = urem i128 %x, 22
ret i128 %ret
@@ -1078,28 +1261,37 @@ define i128 @urem_magic_preshift_and_postshift(i128 %x) nounwind {
define i128 @udiv_magic_large_preshift(i128 %x) nounwind {
; X86-64-LABEL: udiv_magic_large_preshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000
+; X86-64-NEXT: shrq $36, %rsi
+; X86-64-NEXT: movabsq $1676976733973595601, %rcx # imm = 0x1745D1745D1745D1
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %rcx
+; X86-64-NEXT: movq %rax, %rcx
+; X86-64-NEXT: movq %rdx, %rdi
+; X86-64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %rdx
+; X86-64-NEXT: addq %rcx, %rdx
+; X86-64-NEXT: adcq $0, %rdi
+; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: xorl %edx, %edx
-; X86-64-NEXT: callq __udivti3 at PLT
-; X86-64-NEXT: popq %rcx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_magic_large_preshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000
-; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __udivti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: movq %rdx, %rcx
+; WIN64-NEXT: shrq $36, %rcx
+; WIN64-NEXT: movabsq $1676976733973595601, %rdx # imm = 0x1745D1745D1745D1
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: movq %rax, %r8
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: addq %r8, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
%ret = udiv i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100
ret i128 %ret
@@ -1109,28 +1301,45 @@ define i128 @udiv_magic_large_preshift(i128 %x) nounwind {
define i128 @urem_magic_large_preshift(i128 %x) nounwind {
; X86-64-LABEL: urem_magic_large_preshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000
-; X86-64-NEXT: xorl %edx, %edx
-; X86-64-NEXT: callq __umodti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq %rsi, %rcx
+; X86-64-NEXT: shrq $36, %rcx
+; X86-64-NEXT: movabsq $1676976733973595601, %rdx # imm = 0x1745D1745D1745D1
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: mulq %rdx
+; X86-64-NEXT: movq %rax, %r8
+; X86-64-NEXT: movq %rdx, %r9
+; X86-64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: mulq %rdx
+; X86-64-NEXT: addq %r8, %rdx
+; X86-64-NEXT: adcq $0, %r9
+; X86-64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000
+; X86-64-NEXT: imulq %r9, %rax
+; X86-64-NEXT: subq %rax, %rsi
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: movq %rsi, %rdx
; X86-64-NEXT: retq
;
; WIN64-LABEL: urem_magic_large_preshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: shrq $36, %r9
+; WIN64-NEXT: movabsq $1676976733973595601, %rdx # imm = 0x1745D1745D1745D1
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %rdx, %r11
+; WIN64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: addq %r10, %rdx
+; WIN64-NEXT: adcq $0, %r11
; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000
-; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __umodti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: imulq %r11, %rax
+; WIN64-NEXT: subq %rax, %r8
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r8, %rdx
; WIN64-NEXT: retq
%ret = urem i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100
ret i128 %ret
@@ -1140,27 +1349,39 @@ define i128 @urem_magic_large_preshift(i128 %x) nounwind {
define i128 @udiv_magic_large_postshift(i128 %x) nounwind {
; X86-64-LABEL: udiv_magic_large_postshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movl $1, %edx
-; X86-64-NEXT: movl $1, %ecx
-; X86-64-NEXT: callq __udivti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq $-1, %r9
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movq %rax, %r8
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: addq %rsi, %rax
+; X86-64-NEXT: adcq $0, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: addq %r8, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_magic_large_postshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __udivti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq $-1, %r11
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: addq %r8, %rax
+; WIN64-NEXT: adcq $0, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: addq %r10, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
%ret = udiv i128 %x, 18446744073709551617 ; = 2^64 + 1
ret i128 %ret
@@ -1170,27 +1391,45 @@ define i128 @udiv_magic_large_postshift(i128 %x) nounwind {
define i128 @urem_magic_large_postshift(i128 %x) nounwind {
; X86-64-LABEL: urem_magic_large_postshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movl $1, %edx
-; X86-64-NEXT: movl $1, %ecx
-; X86-64-NEXT: callq __umodti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq $-1, %r9
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movq %rax, %r8
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: addq %rsi, %rax
+; X86-64-NEXT: adcq $0, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: addq %r8, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: subq %rcx, %rdi
+; X86-64-NEXT: sbbq $0, %rsi
+; X86-64-NEXT: subq %rcx, %rsi
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: movq %rsi, %rdx
; X86-64-NEXT: retq
;
; WIN64-LABEL: urem_magic_large_postshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __umodti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq $-1, %r11
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: addq %r8, %rax
+; WIN64-NEXT: adcq $0, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: addq %r10, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: subq %r9, %rcx
+; WIN64-NEXT: sbbq $0, %r8
+; WIN64-NEXT: subq %r9, %r8
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r8, %rdx
; WIN64-NEXT: retq
%ret = urem i128 %x, 18446744073709551617 ; = 2^64 + 1
ret i128 %ret
@@ -1200,28 +1439,85 @@ define i128 @urem_magic_large_postshift(i128 %x) nounwind {
define i128 @udiv_magic_is_add(i128 %x) nounwind {
; X86-64-LABEL: udiv_magic_is_add:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X86-64-NEXT: movl $1, %edx
-; X86-64-NEXT: callq __udivti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq $-1, %r8
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: movq %rax, %rcx
+; X86-64-NEXT: movq %rdx, %r9
+; X86-64-NEXT: movq $-3, %r11
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: movq %rdx, %r10
+; X86-64-NEXT: addq %rcx, %r10
+; X86-64-NEXT: adcq $0, %r9
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movq %rax, %r8
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: addq %r10, %rax
+; X86-64-NEXT: adcq %r9, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: leaq (%rdx,%r8), %rax
+; X86-64-NEXT: subq %rax, %rdi
+; X86-64-NEXT: sbbq $0, %rsi
+; X86-64-NEXT: movq %rdx, %rax
+; X86-64-NEXT: addq %r8, %rax
+; X86-64-NEXT: sbbq %rcx, %rsi
+; X86-64-NEXT: shrdq $1, %rsi, %rdi
+; X86-64-NEXT: shrq %rsi
+; X86-64-NEXT: addq %rax, %rdi
+; X86-64-NEXT: adcq $0, %rsi
+; X86-64-NEXT: addq %r8, %rdx
+; X86-64-NEXT: adcq %rsi, %rcx
+; X86-64-NEXT: shrq $63, %rcx
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_magic_is_add:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __udivti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq $-1, %r10
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: movq %rax, %r9
+; WIN64-NEXT: movq %rdx, %r11
+; WIN64-NEXT: movq $-3, %rdi
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: movq %rdx, %rsi
+; WIN64-NEXT: addq %r9, %rsi
+; WIN64-NEXT: adcq $0, %r11
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: addq %rsi, %rax
+; WIN64-NEXT: adcq %r11, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: leaq (%rdx,%r10), %rax
+; WIN64-NEXT: subq %rax, %rcx
+; WIN64-NEXT: sbbq $0, %r8
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: addq %r10, %rax
+; WIN64-NEXT: sbbq %r9, %r8
+; WIN64-NEXT: shrdq $1, %r8, %rcx
+; WIN64-NEXT: shrq %r8
+; WIN64-NEXT: addq %rax, %rcx
+; WIN64-NEXT: adcq $0, %r8
+; WIN64-NEXT: addq %r10, %rdx
+; WIN64-NEXT: adcq %r9, %r8
+; WIN64-NEXT: shrq $63, %r8
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: xorl %edx, %edx
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
; WIN64-NEXT: retq
%ret = udiv i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1
ret i128 %ret
@@ -1231,28 +1527,98 @@ define i128 @udiv_magic_is_add(i128 %x) nounwind {
define i128 @urem_magic_is_add(i128 %x) nounwind {
; X86-64-LABEL: urem_magic_is_add:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X86-64-NEXT: movl $1, %edx
-; X86-64-NEXT: callq __umodti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq $-1, %r8
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: movq %rax, %rcx
+; X86-64-NEXT: movq %rdx, %r9
+; X86-64-NEXT: movq $-3, %r11
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: movq %rdx, %r10
+; X86-64-NEXT: addq %rcx, %r10
+; X86-64-NEXT: adcq $0, %r9
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movq %rax, %r8
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: addq %r10, %rax
+; X86-64-NEXT: adcq %r9, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: leaq (%rdx,%r8), %rax
+; X86-64-NEXT: movq %rdi, %r9
+; X86-64-NEXT: subq %rax, %r9
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: sbbq $0, %rax
+; X86-64-NEXT: movq %rdx, %r10
+; X86-64-NEXT: addq %r8, %r10
+; X86-64-NEXT: sbbq %rcx, %rax
+; X86-64-NEXT: shrdq $1, %rax, %r9
+; X86-64-NEXT: shrq %rax
+; X86-64-NEXT: addq %r10, %r9
+; X86-64-NEXT: adcq $0, %rax
+; X86-64-NEXT: addq %r8, %rdx
+; X86-64-NEXT: adcq %rcx, %rax
+; X86-64-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; X86-64-NEXT: andq %rax, %rdx
+; X86-64-NEXT: shrq $63, %rax
+; X86-64-NEXT: subq %rax, %rdi
+; X86-64-NEXT: sbbq $0, %rsi
+; X86-64-NEXT: addq %rsi, %rdx
+; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: retq
;
; WIN64-LABEL: urem_magic_is_add:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __umodti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq $-1, %r10
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: movq %rax, %r9
+; WIN64-NEXT: movq %rdx, %r11
+; WIN64-NEXT: movq $-3, %rdi
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: movq %rdx, %rsi
+; WIN64-NEXT: addq %r9, %rsi
+; WIN64-NEXT: adcq $0, %r11
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: addq %rsi, %rax
+; WIN64-NEXT: adcq %r11, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: leaq (%rdx,%r10), %rax
+; WIN64-NEXT: movq %rcx, %r11
+; WIN64-NEXT: subq %rax, %r11
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: sbbq $0, %rax
+; WIN64-NEXT: movq %rdx, %rsi
+; WIN64-NEXT: addq %r10, %rsi
+; WIN64-NEXT: sbbq %r9, %rax
+; WIN64-NEXT: shrdq $1, %rax, %r11
+; WIN64-NEXT: shrq %rax
+; WIN64-NEXT: addq %rsi, %r11
+; WIN64-NEXT: adcq $0, %rax
+; WIN64-NEXT: addq %r10, %rdx
+; WIN64-NEXT: adcq %r9, %rax
+; WIN64-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; WIN64-NEXT: andq %rax, %rdx
+; WIN64-NEXT: shrq $63, %rax
+; WIN64-NEXT: subq %rax, %rcx
+; WIN64-NEXT: sbbq $0, %r8
+; WIN64-NEXT: addq %rdx, %r8
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r8, %rdx
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
; WIN64-NEXT: retq
%ret = urem i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1
ret i128 %ret
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index df97f49440f74..a4b06d6af19bf 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -152,40 +152,57 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X86-SSE2-LABEL: fshl_i37:
; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: andl $31, %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE2-NEXT: andl $31, %esi
+; X86-SSE2-NEXT: movl $116080197, %edx # imm = 0x6EB3E45
+; X86-SSE2-NEXT: movl %ecx, %eax
+; X86-SSE2-NEXT: mull %edx
+; X86-SSE2-NEXT: movl %eax, %ebx
+; X86-SSE2-NEXT: movl %edx, %edi
+; X86-SSE2-NEXT: movl $812561381, %edx # imm = 0x306EB3E5
+; X86-SSE2-NEXT: movl %ecx, %eax
+; X86-SSE2-NEXT: mull %edx
+; X86-SSE2-NEXT: movl %edx, %ebp
+; X86-SSE2-NEXT: addl %ebx, %ebp
+; X86-SSE2-NEXT: adcl $0, %edi
+; X86-SSE2-NEXT: movl %esi, %eax
+; X86-SSE2-NEXT: movl $812561381, %edx # imm = 0x306EB3E5
+; X86-SSE2-NEXT: mull %edx
+; X86-SSE2-NEXT: addl %ebp, %eax
+; X86-SSE2-NEXT: adcl %edi, %edx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: imull $116080197, %esi, %esi # imm = 0x6EB3E45
+; X86-SSE2-NEXT: addl %edx, %esi
+; X86-SSE2-NEXT: leal (%esi,%esi,8), %edx
+; X86-SSE2-NEXT: leal (%esi,%edx,4), %edx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT: shldl $27, %ebx, %edi
-; X86-SSE2-NEXT: pushl $0
-; X86-SSE2-NEXT: pushl $37
-; X86-SSE2-NEXT: pushl %eax
-; X86-SSE2-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: calll __umoddi3
-; X86-SSE2-NEXT: addl $16, %esp
-; X86-SSE2-NEXT: movl %eax, %ecx
+; X86-SSE2-NEXT: subl %edx, %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT: shldl $27, %edi, %esi
; X86-SSE2-NEXT: testb $32, %cl
; X86-SSE2-NEXT: jne .LBB3_1
; X86-SSE2-NEXT: # %bb.2:
-; X86-SSE2-NEXT: movl %edi, %ebx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT: movl %esi, %edi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT: movl %eax, %esi
; X86-SSE2-NEXT: jmp .LBB3_3
; X86-SSE2-NEXT: .LBB3_1:
-; X86-SSE2-NEXT: shll $27, %ebx
+; X86-SSE2-NEXT: shll $27, %edi
+; X86-SSE2-NEXT: movl %eax, %edx
; X86-SSE2-NEXT: .LBB3_3:
-; X86-SSE2-NEXT: movl %edi, %eax
-; X86-SSE2-NEXT: shldl %cl, %ebx, %eax
+; X86-SSE2-NEXT: movl %esi, %eax
+; X86-SSE2-NEXT: shldl %cl, %edi, %eax
; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT: shldl %cl, %edi, %esi
-; X86-SSE2-NEXT: movl %esi, %edx
+; X86-SSE2-NEXT: shldl %cl, %esi, %edx
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X64-AVX-LABEL: fshl_i37:
@@ -318,41 +335,58 @@ declare i37 @llvm.fshr.i37(i37, i37, i37)
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X86-SSE2-LABEL: fshr_i37:
; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: andl $31, %eax
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT: shldl $27, %ebx, %esi
-; X86-SSE2-NEXT: pushl $0
-; X86-SSE2-NEXT: pushl $37
-; X86-SSE2-NEXT: pushl %eax
-; X86-SSE2-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: calll __umoddi3
-; X86-SSE2-NEXT: addl $16, %esp
-; X86-SSE2-NEXT: movl %eax, %ecx
-; X86-SSE2-NEXT: addl $27, %ecx
+; X86-SSE2-NEXT: andl $31, %esi
+; X86-SSE2-NEXT: movl $116080197, %edx # imm = 0x6EB3E45
+; X86-SSE2-NEXT: movl %ebp, %eax
+; X86-SSE2-NEXT: mull %edx
+; X86-SSE2-NEXT: movl %eax, %ebx
+; X86-SSE2-NEXT: movl %edx, %edi
+; X86-SSE2-NEXT: movl $812561381, %ecx # imm = 0x306EB3E5
+; X86-SSE2-NEXT: movl %ebp, %eax
+; X86-SSE2-NEXT: mull %ecx
+; X86-SSE2-NEXT: movl %edx, %ebp
+; X86-SSE2-NEXT: addl %ebx, %ebp
+; X86-SSE2-NEXT: adcl $0, %edi
+; X86-SSE2-NEXT: movl %esi, %eax
+; X86-SSE2-NEXT: mull %ecx
+; X86-SSE2-NEXT: movl %edx, %ebx
+; X86-SSE2-NEXT: addl %ebp, %eax
+; X86-SSE2-NEXT: adcl %edi, %ebx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT: imull $116080197, %esi, %eax # imm = 0x6EB3E45
+; X86-SSE2-NEXT: addl %ebx, %eax
+; X86-SSE2-NEXT: leal (%eax,%eax,8), %ecx
+; X86-SSE2-NEXT: leal (%eax,%ecx,4), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE2-NEXT: negl %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: leal 27(%ecx,%eax), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: shldl $27, %edi, %eax
; X86-SSE2-NEXT: testb $32, %cl
; X86-SSE2-NEXT: je .LBB10_1
; X86-SSE2-NEXT: # %bb.2:
-; X86-SSE2-NEXT: movl %edi, %edx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SSE2-NEXT: jmp .LBB10_3
; X86-SSE2-NEXT: .LBB10_1:
-; X86-SSE2-NEXT: shll $27, %ebx
-; X86-SSE2-NEXT: movl %esi, %edx
-; X86-SSE2-NEXT: movl %ebx, %esi
+; X86-SSE2-NEXT: shll $27, %edi
+; X86-SSE2-NEXT: movl %edx, %esi
+; X86-SSE2-NEXT: movl %eax, %edx
+; X86-SSE2-NEXT: movl %edi, %eax
; X86-SSE2-NEXT: .LBB10_3:
-; X86-SSE2-NEXT: shrdl %cl, %edx, %esi
+; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT: shrdl %cl, %edi, %edx
-; X86-SSE2-NEXT: movl %esi, %eax
+; X86-SSE2-NEXT: shrdl %cl, %esi, %edx
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X64-AVX-LABEL: fshr_i37:
diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll
index 9011832421326..a59861de08fdb 100644
--- a/llvm/test/CodeGen/X86/i128-udiv.ll
+++ b/llvm/test/CodeGen/X86/i128-udiv.ll
@@ -42,11 +42,19 @@ define i128 @test2(i128 %x) nounwind {
;
; X64-LABEL: test2:
; X64: # %bb.0:
-; X64-NEXT: pushq %rax
+; X64-NEXT: shrq $2, %rsi
+; X64-NEXT: movl $4, %ecx
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movl $17, %edx
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %rdx
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: movq $-4, %rcx
-; X64-NEXT: callq __udivti3 at PLT
-; X64-NEXT: popq %rcx
; X64-NEXT: retq
%tmp = udiv i128 %x, -73786976294838206464
ret i128 %tmp
@@ -59,11 +67,31 @@ define i128 @test3(i128 %x) nounwind {
;
; X64-LABEL: test3:
; X64: # %bb.0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movq $-3, %rdx
-; X64-NEXT: movq $-5, %rcx
-; X64-NEXT: callq __udivti3 at PLT
-; X64-NEXT: popq %rcx
+; X64-NEXT: movabsq $4611686018427387905, %r9 # imm = 0x4000000000000001
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movl $5, %r10d
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r10
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: addq %rcx, %rdi
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r10
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: adcq %r8, %rdx
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: addq %r9, %rdx
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: shrq $62, %rcx
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
%tmp = udiv i128 %x, -73786976294838206467
ret i128 %tmp
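
As a quick cross-check of the constants in the udiv_magic_preshift_and_postshift
CHECK lines above: for the i128 division by 22, the generated sequence pre-shifts
the dividend right by 1, takes the upper 128 bits of a 128x128-bit multiply with
the magic constant formed from the two movabsq immediates (high half
0xBA2E8BA2E8BA2E8B, low half 0xA2E8BA2E8BA2E8BB, presumably ceil(2^131 / 11)),
and post-shifts by 3. A minimal Python sketch of that computation, under the
assumption that the constant really is ceil(2^131 / 11):

  # Recompute the i128 udiv-by-22 lowering in arbitrary precision:
  #   q = (((x >> 1) * M) >> 128) >> 3
  M = (0xBA2E8BA2E8BA2E8B << 64) | 0xA2E8BA2E8BA2E8BB  # presumably ceil(2**131 / 11)
  for x in (0, 1, 21, 22, 23, 10**30, (1 << 128) - 1):
      q = (((x >> 1) * M) >> 128) >> 3  # pre-shift by 1, post-shift by 3
      assert q == x // 22

The pre-shift is available here because 22 is even: one factor of 2 can be
divided out up front, leaving a division by 11 that the plain magic multiply
handles within 128 bits.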