[llvm] [SelectionDAG] Use Magic Algorithm for Splitting UDIV/UREM by Constant (PR #154968)
Marius Kamp via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 26 04:36:03 PDT 2026
https://github.com/mskamp updated https://github.com/llvm/llvm-project/pull/154968
>From 2aedab8b5df686dd7052387db6754d7a97c5a347 Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Tue, 12 Aug 2025 18:05:50 +0200
Subject: [PATCH 1/4] [SelectionDAG] Add Tests for Large UDIV/UREM by Constant;
NFC
---
llvm/test/CodeGen/X86/divide-by-constant.ll | 164 +++++++++++++
llvm/test/CodeGen/X86/divmod128.ll | 244 ++++++++++++++++++++
2 files changed, 408 insertions(+)
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index e0bff50e2e2dd..b0ecae63b9bfb 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -1190,6 +1190,170 @@ entry:
ret i64 %rem
}
+; PR137514
+define i64 @udiv_i64_magic_large_postshift(i64 %x) nounwind {
+; X86-LABEL: udiv_i64_magic_large_postshift:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __udivdi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i64_magic_large_postshift:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: shrq $63, %rax
+; X64-NEXT: retq
+ %ret = udiv i64 %x, 13835058055282163712 ; = 3 * 2^62
+ ret i64 %ret
+}
+
+; PR137514
+define i64 @urem_i64_magic_large_postshift(i64 %x) nounwind {
+; X86-LABEL: urem_i64_magic_large_postshift:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __umoddi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: urem_i64_magic_large_postshift:
+; X64: # %bb.0:
+; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: shrq %rdx
+; X64-NEXT: movabsq $4611686018427387904, %rax # imm = 0x4000000000000000
+; X64-NEXT: andq %rdx, %rax
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: retq
+ %ret = urem i64 %x, 13835058055282163712 ; = 3 * 2^62
+ ret i64 %ret
+}
+
+; PR137514
+define i64 @udiv_i64_magic_large_preshift(i64 %x) nounwind {
+; X86-LABEL: udiv_i64_magic_large_preshift:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $14
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __udivdi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i64_magic_large_preshift:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq $33, %rax
+; X64-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: retq
+ %ret = udiv i64 %x, 60129542144 ; = 14 * 2^32
+ ret i64 %ret
+}
+
+; PR137514
+define i64 @urem_i64_magic_large_preshift(i64 %x) nounwind {
+; X86-LABEL: urem_i64_magic_large_preshift:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $14
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __umoddi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: urem_i64_magic_large_preshift:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq $33, %rax
+; X64-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movabsq $60129542144, %rax # imm = 0xE00000000
+; X64-NEXT: imulq %rdx, %rax
+; X64-NEXT: subq %rax, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %ret = urem i64 %x, 60129542144 ; = 14 * 2^32
+ ret i64 %ret
+}
+
+; PR137514
+define i64 @udiv_i64_magic_is_add(i64 %x) nounwind {
+; X86-LABEL: udiv_i64_magic_is_add:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $196608 # imm = 0x30000
+; X86-NEXT: pushl $-1
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __udivdi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i64_magic_is_add:
+; X64: # %bb.0:
+; X64-NEXT: movabsq $6148789591883185367, %rcx # imm = 0x5554E38E5ED0FCD7
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: shrq %rdi
+; X64-NEXT: leaq (%rdi,%rdx), %rax
+; X64-NEXT: shrq $49, %rax
+; X64-NEXT: retq
+ %ret = udiv i64 %x, 844429225099263 ; = 3 * 2^48 + 2^32 - 1
+ ret i64 %ret
+}
+
+; PR137514
+define i64 @urem_i64_magic_is_add(i64 %x) nounwind {
+; X86-LABEL: urem_i64_magic_is_add:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl $196608 # imm = 0x30000
+; X86-NEXT: pushl $-1
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll __umoddi3
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: urem_i64_magic_is_add:
+; X64: # %bb.0:
+; X64-NEXT: movabsq $6148789591883185367, %rcx # imm = 0x5554E38E5ED0FCD7
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq %rdx, %rax
+; X64-NEXT: shrq %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: shrq $49, %rax
+; X64-NEXT: movabsq $844429225099263, %rcx # imm = 0x30000FFFFFFFF
+; X64-NEXT: imulq %rax, %rcx
+; X64-NEXT: subq %rcx, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %ret = urem i64 %x, 844429225099263 ; = 3 * 2^48 + 2^32 - 1
+ ret i64 %ret
+}
+
; Make sure we don't inline expand for optsize.
define i64 @urem_i64_3_optsize(i64 %x) nounwind optsize {
; X86-LABEL: urem_i64_3_optsize:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 54022d242ab50..34d29829f5b35 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -1168,3 +1168,247 @@ entry:
%rem = udiv i128 %x, 13
ret i128 %rem
}
+
+; PR137514
+define i128 @udiv_magic_preshift_and_postshift(i128 %x) nounwind {
+; X86-64-LABEL: udiv_magic_preshift_and_postshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movl $202, %edx
+; X86-64-NEXT: xorl %ecx, %ecx
+; X86-64-NEXT: callq __udivti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: udiv_magic_preshift_and_postshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $202, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __udivti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = udiv i128 %x, 202
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @urem_magic_preshift_and_postshift(i128 %x) nounwind {
+; X86-64-LABEL: urem_magic_preshift_and_postshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movl $202, %edx
+; X86-64-NEXT: xorl %ecx, %ecx
+; X86-64-NEXT: callq __umodti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: urem_magic_preshift_and_postshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $202, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __umodti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = urem i128 %x, 202
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @udiv_magic_large_preshift(i128 %x) nounwind {
+; X86-64-LABEL: udiv_magic_large_preshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000
+; X86-64-NEXT: xorl %edx, %edx
+; X86-64-NEXT: callq __udivti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: udiv_magic_large_preshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000
+; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __udivti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = udiv i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @urem_magic_large_preshift(i128 %x) nounwind {
+; X86-64-LABEL: urem_magic_large_preshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000
+; X86-64-NEXT: xorl %edx, %edx
+; X86-64-NEXT: callq __umodti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: urem_magic_large_preshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000
+; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __umodti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = urem i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @udiv_magic_large_postshift(i128 %x) nounwind {
+; X86-64-LABEL: udiv_magic_large_postshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movl $1, %edx
+; X86-64-NEXT: movl $1, %ecx
+; X86-64-NEXT: callq __udivti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: udiv_magic_large_postshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __udivti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = udiv i128 %x, 18446744073709551617 ; = 2^64 + 1
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @urem_magic_large_postshift(i128 %x) nounwind {
+; X86-64-LABEL: urem_magic_large_postshift:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movl $1, %edx
+; X86-64-NEXT: movl $1, %ecx
+; X86-64-NEXT: callq __umodti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: urem_magic_large_postshift:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __umodti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = urem i128 %x, 18446744073709551617 ; = 2^64 + 1
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @udiv_magic_is_add(i128 %x) nounwind {
+; X86-64-LABEL: udiv_magic_is_add:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X86-64-NEXT: movl $1, %edx
+; X86-64-NEXT: callq __udivti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: udiv_magic_is_add:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __udivti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = udiv i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1
+ ret i128 %ret
+}
+
+; PR137514
+define i128 @urem_magic_is_add(i128 %x) nounwind {
+; X86-64-LABEL: urem_magic_is_add:
+; X86-64: # %bb.0:
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X86-64-NEXT: movl $1, %edx
+; X86-64-NEXT: callq __umodti3 at PLT
+; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: retq
+;
+; WIN64-LABEL: urem_magic_is_add:
+; WIN64: # %bb.0:
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __umodti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: retq
+ %ret = urem i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1
+ ret i128 %ret
+}
>From ee98d97b7c9d0a74736d57083498237a4adcf89d Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Tue, 19 Aug 2025 05:39:58 +0200
Subject: [PATCH 2/4] [SelectionDAG] Adjust Existing Tests; NFC
Add new test prefixes to some tests. Currently, these prefixes are
unused, but a subsequent commit will change the test results such that
they become necessary.
Furthermore, rename tests that will be folded after a subsequent commit.
---
llvm/test/CodeGen/RISCV/urem-lkk.ll | 11 +++++------
llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 11 +++++------
2 files changed, 10 insertions(+), 12 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll
index d73930dfb0ffa..27868812cde8e 100644
--- a/llvm/test/CodeGen/RISCV/urem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll
@@ -214,9 +214,8 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) nounwind {
ret i32 %1
}
-; Don't fold i64 urem
-define i64 @dont_fold_urem_i64(i64 %x) nounwind {
-; RV32I-LABEL: dont_fold_urem_i64:
+define i64 @fold_urem_i64(i64 %x) nounwind {
+; RV32I-LABEL: fold_urem_i64:
; RV32I: # %bb.0:
; RV32I-NEXT: addi sp, sp, -16
; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
@@ -227,7 +226,7 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind {
; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
-; RV32IM-LABEL: dont_fold_urem_i64:
+; RV32IM-LABEL: fold_urem_i64:
; RV32IM: # %bb.0:
; RV32IM-NEXT: slli a2, a1, 10
; RV32IM-NEXT: srli a3, a0, 22
@@ -252,12 +251,12 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind {
; RV32IM-NEXT: li a1, 0
; RV32IM-NEXT: ret
;
-; RV64I-LABEL: dont_fold_urem_i64:
+; RV64I-LABEL: fold_urem_i64:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, 98
; RV64I-NEXT: tail __umoddi3
;
-; RV64IM-LABEL: dont_fold_urem_i64:
+; RV64IM-LABEL: fold_urem_i64:
; RV64IM: # %bb.0:
; RV64IM-NEXT: lui a1, %hi(.LCPI6_0)
; RV64IM-NEXT: ld a1, %lo(.LCPI6_0)(a1)
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 180fa6fd2b2f6..ce3e2c1a4f7be 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -778,9 +778,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
ret <4 x i16> %1
}
-; Don't fold i64 urem.
-define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
-; RV32I-LABEL: dont_fold_urem_i64:
+define <4 x i64> @fold_urem_i64(<4 x i64> %x) nounwind {
+; RV32I-LABEL: fold_urem_i64:
; RV32I: # %bb.0:
; RV32I-NEXT: addi sp, sp, -48
; RV32I-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
@@ -850,7 +849,7 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32I-NEXT: addi sp, sp, 48
; RV32I-NEXT: ret
;
-; RV32IM-LABEL: dont_fold_urem_i64:
+; RV32IM-LABEL: fold_urem_i64:
; RV32IM: # %bb.0:
; RV32IM-NEXT: addi sp, sp, -48
; RV32IM-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
@@ -926,7 +925,7 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: addi sp, sp, 48
; RV32IM-NEXT: ret
;
-; RV64I-LABEL: dont_fold_urem_i64:
+; RV64I-LABEL: fold_urem_i64:
; RV64I: # %bb.0:
; RV64I-NEXT: addi sp, sp, -48
; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill
@@ -962,7 +961,7 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV64I-NEXT: addi sp, sp, 48
; RV64I-NEXT: ret
;
-; RV64IM-LABEL: dont_fold_urem_i64:
+; RV64IM-LABEL: fold_urem_i64:
; RV64IM: # %bb.0:
; RV64IM-NEXT: ld a2, 8(a1)
; RV64IM-NEXT: ld a3, 16(a1)
>From 336175451ea76d2bd42ea23511ca329ebbec61bb Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Tue, 15 Jul 2025 16:21:25 +0200
Subject: [PATCH 3/4] [SelectionDAG] Move UREM Decomposition to Own Function;
NFC
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 +
.../CodeGen/SelectionDAG/TargetLowering.cpp | 73 +++++++++++--------
2 files changed, 47 insertions(+), 30 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a3583013de52d..bd05f747b70e7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5966,6 +5966,10 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
SDValue buildSREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue CompTargetNode,
ISD::CondCode Cond, DAGCombinerInfo &DCI,
const SDLoc &DL) const;
+
+ bool expandUDIVREMByConstantViaUREMDecomposition(
+ SDNode *N, APInt Divisor, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
+ SelectionDAG &DAG, SDValue LL, SDValue LH) const;
};
/// Given an LLVM IR type and return type attributes, compute the return value
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index da0c427911d88..2e86451a88440 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8136,25 +8136,12 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
// dividend and multiply by the multiplicative inverse of the shifted divisor.
// If we want the remainder, we shift the value left by the number of trailing
// zeros and add the bits that were shifted out of the dividend.
-bool TargetLowering::expandDIVREMByConstant(SDNode *N,
- SmallVectorImpl<SDValue> &Result,
- EVT HiLoVT, SelectionDAG &DAG,
- SDValue LL, SDValue LH) const {
+bool TargetLowering::expandUDIVREMByConstantViaUREMDecomposition(
+ SDNode *N, APInt Divisor, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
+ SelectionDAG &DAG, SDValue LL, SDValue LH) const {
unsigned Opcode = N->getOpcode();
EVT VT = N->getValueType(0);
- // TODO: Support signed division/remainder.
- if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
- return false;
- assert(
- (Opcode == ISD::UREM || Opcode == ISD::UDIV || Opcode == ISD::UDIVREM) &&
- "Unexpected opcode");
-
- auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!CN)
- return false;
-
- APInt Divisor = CN->getAPIntValue();
unsigned BitWidth = Divisor.getBitWidth();
unsigned HBitWidth = BitWidth / 2;
assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -8165,20 +8152,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
if (Divisor.uge(HalfMaxPlus1))
return false;
- // We depend on the UREM by constant optimization in DAGCombiner that requires
- // high multiply.
- if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
- !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
- return false;
-
- // Don't expand if optimizing for size.
- if (DAG.shouldOptForSize())
- return false;
-
- // Early out for 0 or 1 divisors.
- if (Divisor.ule(1))
- return false;
-
// If the divisor is even, shift it until it becomes odd.
unsigned TrailingZeros = 0;
if (!Divisor[0]) {
@@ -8352,6 +8325,46 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
return true;
}
+bool TargetLowering::expandDIVREMByConstant(SDNode *N,
+ SmallVectorImpl<SDValue> &Result,
+ EVT HiLoVT, SelectionDAG &DAG,
+ SDValue LL, SDValue LH) const {
+ unsigned Opcode = N->getOpcode();
+
+ // TODO: Support signed division/remainder.
+ if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
+ return false;
+ assert(
+ (Opcode == ISD::UREM || Opcode == ISD::UDIV || Opcode == ISD::UDIVREM) &&
+ "Unexpected opcode");
+
+ auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!CN)
+ return false;
+
+ APInt Divisor = CN->getAPIntValue();
+
+ // We depend on the UREM by constant optimization in DAGCombiner that requires
+ // high multiply.
+ if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
+ !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
+ return false;
+
+ // Don't expand if optimizing for size.
+ if (DAG.shouldOptForSize())
+ return false;
+
+ // Early out for 0 or 1 divisors.
+ if (Divisor.ule(1))
+ return false;
+
+ if (expandUDIVREMByConstantViaUREMDecomposition(N, Divisor, Result, HiLoVT,
+ DAG, LL, LH))
+ return true;
+
+ return false;
+}
+
// Check that (every element of) Z is undef or not an exact multiple of BW.
static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) {
return ISD::matchUnaryPredicate(
>From 319bafce97ac8c54c00e66b6984aa8fde99197ca Mon Sep 17 00:00:00 2001
From: Marius Kamp <msk at posteo.org>
Date: Tue, 12 Aug 2025 16:49:49 +0200
Subject: [PATCH 4/4] [SelectionDAG] Use Magic Algorithm for Splitting
UDIV/UREM by Constant
For integer types twice as large as a legal type, we have previously
generated a library call if another splitting technique was not
applicable.
With this change, we use an adaptation of the Magic algorithm. This
algorithm is also used for UDIV/UREM by constants on legal types. The
implementation introduced here is a simple port of the already existing
implementation to types twice the size of a legal type. The core idea of
this algorithm is to replace (udiv x c) for a constant c with the bits
at or above the s-th bit of the multiplication of x by (2^s + o)/c
for some s and o. More details are available in Henry S. Warren, Jr.:
"Hacker's Delight", chapter 10.
Efficient handling of UDIV/UREM by constants on types twice as large
as a legal type is mostly relevant for 32-bit platforms. However, some
projects may also benefit on 64-bit platforms. For example, the `fmt`
library for C++ uses 128-bit unsigned divisions by 100 and 10000, which
were not covered by the previously existing optimizations.
Closes #137514.
---
llvm/include/llvm/CodeGen/TargetLowering.h | 5 +
.../CodeGen/SelectionDAG/TargetLowering.cpp | 112 ++++
llvm/test/CodeGen/ARM/funnel-shift.ll | 240 ++++----
llvm/test/CodeGen/Mips/funnel-shift.ll | 373 ++++++------
llvm/test/CodeGen/PowerPC/funnel-shift.ll | 312 +++++------
llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 194 ++++---
llvm/test/CodeGen/X86/divide-by-constant.ll | 372 +++++++++---
llvm/test/CodeGen/X86/divmod128.ll | 529 +++++++++++++-----
llvm/test/CodeGen/X86/funnel-shift.ll | 112 ++--
llvm/test/CodeGen/X86/i128-udiv.ll | 77 ++-
10 files changed, 1527 insertions(+), 799 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bd05f747b70e7..b88967fc07df8 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5970,6 +5970,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
bool expandUDIVREMByConstantViaUREMDecomposition(
SDNode *N, APInt Divisor, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
SelectionDAG &DAG, SDValue LL, SDValue LH) const;
+
+ bool expandUDIVREMByConstantViaUMulHiMagic(SDNode *N, const APInt &Divisor,
+ SmallVectorImpl<SDValue> &Result,
+ EVT HiLoVT, SelectionDAG &DAG,
+ SDValue LL, SDValue LH) const;
};
/// Given an LLVM IR type and return type attributes, compute the return value
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 2e86451a88440..5b1c6664174a9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8325,6 +8325,114 @@ bool TargetLowering::expandUDIVREMByConstantViaUREMDecomposition(
return true;
}
+bool TargetLowering::expandUDIVREMByConstantViaUMulHiMagic(
+ SDNode *N, const APInt &Divisor, SmallVectorImpl<SDValue> &Result,
+ EVT HiLoVT, SelectionDAG &DAG, SDValue LL, SDValue LH) const {
+
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N0->getValueType(0);
+ SDLoc DL{N};
+
+ assert(!Divisor.isOne() && "Magic algorithm does not work for division by 1");
+
+ // This helper creates a MUL_LOHI of the pair (LL, LH) by a constant.
+ auto MakeMUL_LOHIByConst = [&](unsigned Opc, SDValue LL, SDValue LH,
+ const APInt &Const,
+ SmallVectorImpl<SDValue> &Result) {
+ SDValue LHS = DAG.getNode(ISD::BUILD_PAIR, DL, VT, LL, LH);
+ SDValue RHS = DAG.getConstant(Const, DL, VT);
+ auto [RL, RH] = DAG.SplitScalar(RHS, DL, HiLoVT, HiLoVT);
+ return expandMUL_LOHI(Opc, VT, DL, LHS, RHS, Result, HiLoVT, DAG,
+ TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
+ LL, LH, RL, RH);
+ };
+
+ // This helper creates an ADD/SUB of the pairs (LL, LH) and (RL, RH).
+ auto MakeAddSubLong = [&](unsigned Opc, SDValue LL, SDValue LH, SDValue RL,
+ SDValue RH) {
+ SDValue AddSubNode =
+ DAG.getNode(Opc == ISD::ADD ? ISD::UADDO : ISD::USUBO, DL,
+ DAG.getVTList(HiLoVT, MVT::i1), LL, RL);
+ SDValue OutL = AddSubNode.getValue(0);
+ SDValue Overflow = AddSubNode.getValue(1);
+ SDValue AddSubWithOverflow =
+ DAG.getNode(Opc == ISD::ADD ? ISD::UADDO_CARRY : ISD::USUBO_CARRY, DL,
+ DAG.getVTList(HiLoVT, MVT::i1), LH, RH, Overflow);
+ SDValue OutH = AddSubWithOverflow.getValue(0);
+ return std::make_pair(OutL, OutH);
+ };
+
+ // This helper creates a SRL of the pair (LL, LH) by Shift.
+ auto MakeSRLLong = [&](SDValue LL, SDValue LH, unsigned Shift) {
+ unsigned HBitWidth = HiLoVT.getScalarSizeInBits();
+ if (Shift < HBitWidth) {
+ SDValue ShAmt = DAG.getShiftAmountConstant(Shift, HiLoVT, DL);
+ SDValue ResL = DAG.getNode(ISD::FSHR, DL, HiLoVT, LH, LL, ShAmt);
+ SDValue ResH = DAG.getNode(ISD::SRL, DL, HiLoVT, LH, ShAmt);
+ return std::make_pair(ResL, ResH);
+ }
+ SDValue Zero = DAG.getConstant(0, DL, HiLoVT);
+ if (Shift == HBitWidth)
+ return std::make_pair(LH, Zero);
+ assert(Shift - HBitWidth < HBitWidth &&
+ "We shouldn't generate an undefined shift");
+ SDValue ShAmt = DAG.getShiftAmountConstant(Shift - HBitWidth, HiLoVT, DL);
+ return std::make_pair(DAG.getNode(ISD::SRL, DL, HiLoVT, LH, ShAmt), Zero);
+ };
+
+ // Knowledge of leading zeros may help to reduce the multiplier.
+ unsigned KnownLeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
+
+ UnsignedDivisionByConstantInfo Magics = UnsignedDivisionByConstantInfo::get(
+ Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()));
+
+ assert(!LL == !LH && "Expected both input halves or no input halves!");
+ if (!LL)
+ std::tie(LL, LH) = DAG.SplitScalar(N0, DL, HiLoVT, HiLoVT);
+ SDValue QL = LL;
+ SDValue QH = LH;
+ if (Magics.PreShift != 0)
+ std::tie(QL, QH) = MakeSRLLong(QL, QH, Magics.PreShift);
+
+ SmallVector<SDValue, 2> UMulResult;
+ if (!MakeMUL_LOHIByConst(ISD::UMUL_LOHI, QL, QH, Magics.Magic, UMulResult))
+ return false;
+
+ QL = UMulResult[2];
+ QH = UMulResult[3];
+
+ if (Magics.IsAdd) {
+ auto [NPQL, NPQH] = MakeAddSubLong(ISD::SUB, LL, LH, QL, QH);
+ std::tie(NPQL, NPQH) = MakeSRLLong(NPQL, NPQH, 1);
+ std::tie(QL, QH) = MakeAddSubLong(ISD::ADD, NPQL, NPQH, QL, QH);
+ }
+
+ if (Magics.PostShift != 0)
+ std::tie(QL, QH) = MakeSRLLong(QL, QH, Magics.PostShift);
+
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != ISD::UREM) {
+ Result.push_back(QL);
+ Result.push_back(QH);
+ }
+
+ if (Opcode != ISD::UDIV) {
+ SmallVector<SDValue, 2> MulResult;
+ if (!MakeMUL_LOHIByConst(ISD::MUL, QL, QH, Divisor, MulResult))
+ return false;
+
+ assert(MulResult.size() == 2);
+
+ auto [RemL, RemH] =
+ MakeAddSubLong(ISD::SUB, LL, LH, MulResult[0], MulResult[1]);
+
+ Result.push_back(RemL);
+ Result.push_back(RemH);
+ }
+
+ return true;
+}
+
bool TargetLowering::expandDIVREMByConstant(SDNode *N,
SmallVectorImpl<SDValue> &Result,
EVT HiLoVT, SelectionDAG &DAG,
@@ -8362,6 +8470,10 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
DAG, LL, LH))
return true;
+ if (expandUDIVREMByConstantViaUMulHiMagic(N, Divisor, Result, HiLoVT, DAG, LL,
+ LH))
+ return true;
+
return false;
}
diff --git a/llvm/test/CodeGen/ARM/funnel-shift.ll b/llvm/test/CodeGen/ARM/funnel-shift.ll
index 191155ae30f3e..77bed94918f2a 100644
--- a/llvm/test/CodeGen/ARM/funnel-shift.ll
+++ b/llvm/test/CodeGen/ARM/funnel-shift.ll
@@ -47,67 +47,77 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; SCALAR-LABEL: fshl_i37:
; SCALAR: @ %bb.0:
-; SCALAR-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; SCALAR-NEXT: push {r4, r5, r6, r7, r8, lr}
-; SCALAR-NEXT: mov r8, r0
-; SCALAR-NEXT: ldr r0, [sp, #28]
-; SCALAR-NEXT: mov r4, r1
-; SCALAR-NEXT: mov r5, r3
-; SCALAR-NEXT: and r1, r0, #31
-; SCALAR-NEXT: ldr r0, [sp, #24]
-; SCALAR-NEXT: mov r6, r2
-; SCALAR-NEXT: mov r2, #37
-; SCALAR-NEXT: mov r3, #0
-; SCALAR-NEXT: bl __aeabi_uldivmod
-; SCALAR-NEXT: lsl r0, r5, #27
-; SCALAR-NEXT: tst r2, #32
-; SCALAR-NEXT: orr r0, r0, r6, lsr #5
-; SCALAR-NEXT: mov r1, r8
-; SCALAR-NEXT: and r3, r2, #31
-; SCALAR-NEXT: mov r7, #31
+; SCALAR-NEXT: .save {r4, r5, r6, r7, r11, lr}
+; SCALAR-NEXT: push {r4, r5, r6, r7, r11, lr}
+; SCALAR-NEXT: ldr lr, [sp, #24]
+; SCALAR-NEXT: movw r12, #46053
+; SCALAR-NEXT: movt r12, #12398
+; SCALAR-NEXT: movw r6, #15941
+; SCALAR-NEXT: ldr r7, [sp, #28]
+; SCALAR-NEXT: movt r6, #1771
+; SCALAR-NEXT: umull r4, r5, lr, r12
+; SCALAR-NEXT: lsl r3, r3, #27
+; SCALAR-NEXT: mov r4, #0
+; SCALAR-NEXT: and r7, r7, #31
+; SCALAR-NEXT: umlal r5, r4, lr, r6
+; SCALAR-NEXT: orr r3, r3, r2, lsr #5
+; SCALAR-NEXT: umlal r5, r4, r7, r12
+; SCALAR-NEXT: mla r7, r7, r6, r4
+; SCALAR-NEXT: mov r6, #37
+; SCALAR-NEXT: mls r7, r7, r6, lr
+; SCALAR-NEXT: mov r6, r0
+; SCALAR-NEXT: tst r7, #32
+; SCALAR-NEXT: and r5, r7, #31
+; SCALAR-NEXT: movne r6, r3
+; SCALAR-NEXT: lslne r3, r2, #27
+; SCALAR-NEXT: lsr r2, r3, #1
+; SCALAR-NEXT: mov r3, #31
+; SCALAR-NEXT: bic r3, r3, r7
; SCALAR-NEXT: movne r1, r0
-; SCALAR-NEXT: lslne r0, r6, #27
-; SCALAR-NEXT: bic r2, r7, r2
-; SCALAR-NEXT: lsl r5, r1, r3
-; SCALAR-NEXT: lsr r0, r0, #1
-; SCALAR-NEXT: movne r4, r8
-; SCALAR-NEXT: lsr r1, r1, #1
-; SCALAR-NEXT: lsl r3, r4, r3
-; SCALAR-NEXT: orr r0, r5, r0, lsr r2
-; SCALAR-NEXT: orr r1, r3, r1, lsr r2
-; SCALAR-NEXT: pop {r4, r5, r6, r7, r8, pc}
+; SCALAR-NEXT: lsl r4, r6, r5
+; SCALAR-NEXT: lsl r0, r1, r5
+; SCALAR-NEXT: lsr r1, r6, #1
+; SCALAR-NEXT: orr r2, r4, r2, lsr r3
+; SCALAR-NEXT: orr r1, r0, r1, lsr r3
+; SCALAR-NEXT: mov r0, r2
+; SCALAR-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; NEON-LABEL: fshl_i37:
; NEON: @ %bb.0:
-; NEON-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; NEON-NEXT: push {r4, r5, r6, r7, r8, lr}
-; NEON-NEXT: mov r4, r1
-; NEON-NEXT: ldr r1, [sp, #28]
-; NEON-NEXT: mov r8, r0
-; NEON-NEXT: ldr r0, [sp, #24]
-; NEON-NEXT: and r1, r1, #31
-; NEON-NEXT: mov r5, r3
-; NEON-NEXT: mov r6, r2
-; NEON-NEXT: mov r2, #37
-; NEON-NEXT: mov r3, #0
-; NEON-NEXT: bl __aeabi_uldivmod
-; NEON-NEXT: lsl r0, r5, #27
-; NEON-NEXT: tst r2, #32
-; NEON-NEXT: orr r0, r0, r6, lsr #5
-; NEON-NEXT: mov r1, r8
-; NEON-NEXT: and r3, r2, #31
-; NEON-NEXT: mov r7, #31
+; NEON-NEXT: .save {r4, r5, r6, r7, r11, lr}
+; NEON-NEXT: push {r4, r5, r6, r7, r11, lr}
+; NEON-NEXT: ldr r12, [sp, #24]
+; NEON-NEXT: movw lr, #46053
+; NEON-NEXT: movt lr, #12398
+; NEON-NEXT: ldr r6, [sp, #28]
+; NEON-NEXT: mov r7, #0
+; NEON-NEXT: lsl r3, r3, #27
+; NEON-NEXT: umull r4, r5, r12, lr
+; NEON-NEXT: and r6, r6, #31
+; NEON-NEXT: movw r4, #15941
+; NEON-NEXT: movt r4, #1771
+; NEON-NEXT: umlal r5, r7, r12, r4
+; NEON-NEXT: orr r3, r3, r2, lsr #5
+; NEON-NEXT: umlal r5, r7, r6, lr
+; NEON-NEXT: mla r7, r6, r4, r7
+; NEON-NEXT: mov r6, #37
+; NEON-NEXT: mls r7, r7, r6, r12
+; NEON-NEXT: mov r6, r0
+; NEON-NEXT: tst r7, #32
+; NEON-NEXT: and r5, r7, #31
+; NEON-NEXT: movne r6, r3
+; NEON-NEXT: lslne r3, r2, #27
+; NEON-NEXT: lsr r2, r3, #1
+; NEON-NEXT: mov r3, #31
+; NEON-NEXT: bic r3, r3, r7
; NEON-NEXT: movne r1, r0
-; NEON-NEXT: lslne r0, r6, #27
-; NEON-NEXT: bic r2, r7, r2
-; NEON-NEXT: lsl r5, r1, r3
-; NEON-NEXT: lsr r0, r0, #1
-; NEON-NEXT: movne r4, r8
-; NEON-NEXT: lsr r1, r1, #1
-; NEON-NEXT: lsl r3, r4, r3
-; NEON-NEXT: orr r0, r5, r0, lsr r2
-; NEON-NEXT: orr r1, r3, r1, lsr r2
-; NEON-NEXT: pop {r4, r5, r6, r7, r8, pc}
+; NEON-NEXT: lsl r4, r6, r5
+; NEON-NEXT: lsl r0, r1, r5
+; NEON-NEXT: lsr r1, r6, #1
+; NEON-NEXT: orr r2, r4, r2, lsr r3
+; NEON-NEXT: orr r1, r0, r1, lsr r3
+; NEON-NEXT: mov r0, r2
+; NEON-NEXT: pop {r4, r5, r6, r7, r11, pc}
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
@@ -237,66 +247,76 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; SCALAR: @ %bb.0:
; SCALAR-NEXT: .save {r4, r5, r6, r7, r11, lr}
; SCALAR-NEXT: push {r4, r5, r6, r7, r11, lr}
-; SCALAR-NEXT: mov r5, r0
-; SCALAR-NEXT: ldr r0, [sp, #28]
-; SCALAR-NEXT: mov r4, r1
-; SCALAR-NEXT: mov r6, r3
-; SCALAR-NEXT: and r1, r0, #31
-; SCALAR-NEXT: ldr r0, [sp, #24]
-; SCALAR-NEXT: mov r7, r2
-; SCALAR-NEXT: mov r2, #37
-; SCALAR-NEXT: mov r3, #0
-; SCALAR-NEXT: bl __aeabi_uldivmod
-; SCALAR-NEXT: add r0, r2, #27
-; SCALAR-NEXT: lsl r2, r6, #27
-; SCALAR-NEXT: orr r2, r2, r7, lsr #5
-; SCALAR-NEXT: mov r1, #31
-; SCALAR-NEXT: tst r0, #32
-; SCALAR-NEXT: mov r3, r5
-; SCALAR-NEXT: moveq r3, r2
-; SCALAR-NEXT: lsleq r2, r7, #27
-; SCALAR-NEXT: bic r1, r1, r0
-; SCALAR-NEXT: and r7, r0, #31
-; SCALAR-NEXT: lsl r6, r3, #1
-; SCALAR-NEXT: moveq r4, r5
-; SCALAR-NEXT: lsl r6, r6, r1
-; SCALAR-NEXT: orr r0, r6, r2, lsr r7
-; SCALAR-NEXT: lsl r2, r4, #1
-; SCALAR-NEXT: lsl r1, r2, r1
-; SCALAR-NEXT: orr r1, r1, r3, lsr r7
+; SCALAR-NEXT: ldr lr, [sp, #24]
+; SCALAR-NEXT: movw r12, #46053
+; SCALAR-NEXT: movt r12, #12398
+; SCALAR-NEXT: movw r6, #15941
+; SCALAR-NEXT: ldr r7, [sp, #28]
+; SCALAR-NEXT: movt r6, #1771
+; SCALAR-NEXT: umull r4, r5, lr, r12
+; SCALAR-NEXT: lsl r3, r3, #27
+; SCALAR-NEXT: mov r4, #0
+; SCALAR-NEXT: and r7, r7, #31
+; SCALAR-NEXT: umlal r5, r4, lr, r6
+; SCALAR-NEXT: orr r3, r3, r2, lsr #5
+; SCALAR-NEXT: umlal r5, r4, r7, r12
+; SCALAR-NEXT: mov r5, #31
+; SCALAR-NEXT: mla r7, r7, r6, r4
+; SCALAR-NEXT: mov r6, #37
+; SCALAR-NEXT: mls r7, r7, r6, lr
+; SCALAR-NEXT: mov r6, r0
+; SCALAR-NEXT: add r7, r7, #27
+; SCALAR-NEXT: tst r7, #32
+; SCALAR-NEXT: bic r5, r5, r7
+; SCALAR-NEXT: moveq r6, r3
+; SCALAR-NEXT: lsleq r3, r2, #27
+; SCALAR-NEXT: lsl r2, r6, #1
+; SCALAR-NEXT: and r7, r7, #31
+; SCALAR-NEXT: lsl r2, r2, r5
+; SCALAR-NEXT: moveq r1, r0
+; SCALAR-NEXT: lsl r0, r1, #1
+; SCALAR-NEXT: orr r2, r2, r3, lsr r7
+; SCALAR-NEXT: lsl r0, r0, r5
+; SCALAR-NEXT: orr r1, r0, r6, lsr r7
+; SCALAR-NEXT: mov r0, r2
; SCALAR-NEXT: pop {r4, r5, r6, r7, r11, pc}
;
; NEON-LABEL: fshr_i37:
; NEON: @ %bb.0:
; NEON-NEXT: .save {r4, r5, r6, r7, r11, lr}
; NEON-NEXT: push {r4, r5, r6, r7, r11, lr}
-; NEON-NEXT: mov r4, r1
-; NEON-NEXT: ldr r1, [sp, #28]
-; NEON-NEXT: mov r5, r0
-; NEON-NEXT: ldr r0, [sp, #24]
-; NEON-NEXT: and r1, r1, #31
-; NEON-NEXT: mov r6, r3
-; NEON-NEXT: mov r7, r2
-; NEON-NEXT: mov r2, #37
-; NEON-NEXT: mov r3, #0
-; NEON-NEXT: bl __aeabi_uldivmod
-; NEON-NEXT: add r0, r2, #27
-; NEON-NEXT: lsl r2, r6, #27
-; NEON-NEXT: orr r2, r2, r7, lsr #5
-; NEON-NEXT: mov r1, #31
-; NEON-NEXT: tst r0, #32
-; NEON-NEXT: mov r3, r5
-; NEON-NEXT: moveq r3, r2
-; NEON-NEXT: lsleq r2, r7, #27
-; NEON-NEXT: bic r1, r1, r0
-; NEON-NEXT: and r7, r0, #31
-; NEON-NEXT: lsl r6, r3, #1
-; NEON-NEXT: moveq r4, r5
-; NEON-NEXT: lsl r6, r6, r1
-; NEON-NEXT: orr r0, r6, r2, lsr r7
-; NEON-NEXT: lsl r2, r4, #1
-; NEON-NEXT: lsl r1, r2, r1
-; NEON-NEXT: orr r1, r1, r3, lsr r7
+; NEON-NEXT: ldr r12, [sp, #24]
+; NEON-NEXT: movw lr, #46053
+; NEON-NEXT: movt lr, #12398
+; NEON-NEXT: ldr r6, [sp, #28]
+; NEON-NEXT: mov r7, #0
+; NEON-NEXT: lsl r3, r3, #27
+; NEON-NEXT: umull r4, r5, r12, lr
+; NEON-NEXT: and r6, r6, #31
+; NEON-NEXT: movw r4, #15941
+; NEON-NEXT: movt r4, #1771
+; NEON-NEXT: umlal r5, r7, r12, r4
+; NEON-NEXT: orr r3, r3, r2, lsr #5
+; NEON-NEXT: umlal r5, r7, r6, lr
+; NEON-NEXT: mov r5, #31
+; NEON-NEXT: mla r7, r6, r4, r7
+; NEON-NEXT: mov r6, #37
+; NEON-NEXT: mls r7, r7, r6, r12
+; NEON-NEXT: mov r6, r0
+; NEON-NEXT: add r7, r7, #27
+; NEON-NEXT: tst r7, #32
+; NEON-NEXT: bic r5, r5, r7
+; NEON-NEXT: moveq r6, r3
+; NEON-NEXT: lsleq r3, r2, #27
+; NEON-NEXT: lsl r2, r6, #1
+; NEON-NEXT: and r7, r7, #31
+; NEON-NEXT: lsl r2, r2, r5
+; NEON-NEXT: moveq r1, r0
+; NEON-NEXT: lsl r0, r1, #1
+; NEON-NEXT: orr r2, r2, r3, lsr r7
+; NEON-NEXT: lsl r0, r0, r5
+; NEON-NEXT: orr r1, r0, r6, lsr r7
+; NEON-NEXT: mov r0, r2
; NEON-NEXT: pop {r4, r5, r6, r7, r11, pc}
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
diff --git a/llvm/test/CodeGen/Mips/funnel-shift.ll b/llvm/test/CodeGen/Mips/funnel-shift.ll
index 99e0d47441a02..51e212c8c5ae0 100644
--- a/llvm/test/CodeGen/Mips/funnel-shift.ll
+++ b/llvm/test/CodeGen/Mips/funnel-shift.ll
@@ -48,105 +48,106 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-BE-LABEL: fshl_i37:
; CHECK-BE: # %bb.0:
-; CHECK-BE-NEXT: addiu $sp, $sp, -40
-; CHECK-BE-NEXT: .cfi_def_cfa_offset 40
-; CHECK-BE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: .cfi_offset 31, -4
-; CHECK-BE-NEXT: .cfi_offset 19, -8
-; CHECK-BE-NEXT: .cfi_offset 18, -12
-; CHECK-BE-NEXT: .cfi_offset 17, -16
-; CHECK-BE-NEXT: .cfi_offset 16, -20
-; CHECK-BE-NEXT: move $16, $7
-; CHECK-BE-NEXT: move $17, $6
-; CHECK-BE-NEXT: move $18, $5
-; CHECK-BE-NEXT: move $19, $4
-; CHECK-BE-NEXT: lw $1, 56($sp)
-; CHECK-BE-NEXT: andi $4, $1, 31
-; CHECK-BE-NEXT: lw $5, 60($sp)
-; CHECK-BE-NEXT: addiu $6, $zero, 0
-; CHECK-BE-NEXT: jal __umoddi3
-; CHECK-BE-NEXT: addiu $7, $zero, 37
-; CHECK-BE-NEXT: srl $1, $3, 5
-; CHECK-BE-NEXT: andi $1, $1, 1
-; CHECK-BE-NEXT: movn $19, $18, $1
-; CHECK-BE-NEXT: sllv $2, $19, $3
-; CHECK-BE-NEXT: not $4, $3
-; CHECK-BE-NEXT: srl $5, $16, 5
-; CHECK-BE-NEXT: sll $6, $17, 27
-; CHECK-BE-NEXT: or $5, $6, $5
-; CHECK-BE-NEXT: movn $18, $5, $1
-; CHECK-BE-NEXT: srl $6, $18, 1
-; CHECK-BE-NEXT: srlv $6, $6, $4
-; CHECK-BE-NEXT: or $2, $2, $6
-; CHECK-BE-NEXT: sllv $3, $18, $3
-; CHECK-BE-NEXT: sll $6, $16, 27
-; CHECK-BE-NEXT: movn $5, $6, $1
-; CHECK-BE-NEXT: srl $1, $5, 1
-; CHECK-BE-NEXT: srlv $1, $1, $4
-; CHECK-BE-NEXT: or $3, $3, $1
-; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; CHECK-BE-NEXT: lui $1, 1771
+; CHECK-BE-NEXT: ori $1, $1, 15941
+; CHECK-BE-NEXT: lw $2, 20($sp)
+; CHECK-BE-NEXT: multu $2, $1
+; CHECK-BE-NEXT: mfhi $3
+; CHECK-BE-NEXT: mflo $8
+; CHECK-BE-NEXT: lui $9, 12398
+; CHECK-BE-NEXT: ori $9, $9, 46053
+; CHECK-BE-NEXT: multu $2, $9
+; CHECK-BE-NEXT: mfhi $10
+; CHECK-BE-NEXT: lw $11, 16($sp)
+; CHECK-BE-NEXT: andi $11, $11, 31
+; CHECK-BE-NEXT: multu $11, $9
+; CHECK-BE-NEXT: mflo $9
+; CHECK-BE-NEXT: mfhi $12
+; CHECK-BE-NEXT: addu $8, $10, $8
+; CHECK-BE-NEXT: sltu $10, $8, $10
+; CHECK-BE-NEXT: addu $9, $8, $9
+; CHECK-BE-NEXT: sltu $8, $9, $8
+; CHECK-BE-NEXT: addu $3, $3, $10
+; CHECK-BE-NEXT: srl $9, $7, 5
+; CHECK-BE-NEXT: sll $6, $6, 27
+; CHECK-BE-NEXT: or $6, $6, $9
+; CHECK-BE-NEXT: addu $3, $3, $12
+; CHECK-BE-NEXT: sll $7, $7, 27
+; CHECK-BE-NEXT: addu $3, $3, $8
+; CHECK-BE-NEXT: mul $1, $11, $1
+; CHECK-BE-NEXT: addu $1, $3, $1
+; CHECK-BE-NEXT: sll $3, $1, 2
+; CHECK-BE-NEXT: addu $3, $3, $1
+; CHECK-BE-NEXT: sll $1, $1, 5
+; CHECK-BE-NEXT: addu $1, $1, $3
+; CHECK-BE-NEXT: subu $1, $2, $1
+; CHECK-BE-NEXT: andi $2, $1, 32
+; CHECK-BE-NEXT: srl $3, $2, 5
+; CHECK-BE-NEXT: movn $4, $5, $3
+; CHECK-BE-NEXT: sllv $2, $4, $1
+; CHECK-BE-NEXT: not $4, $1
+; CHECK-BE-NEXT: movn $5, $6, $3
+; CHECK-BE-NEXT: srl $8, $5, 1
+; CHECK-BE-NEXT: srlv $8, $8, $4
+; CHECK-BE-NEXT: or $2, $2, $8
+; CHECK-BE-NEXT: sllv $1, $5, $1
+; CHECK-BE-NEXT: movn $6, $7, $3
+; CHECK-BE-NEXT: srl $3, $6, 1
+; CHECK-BE-NEXT: srlv $3, $3, $4
; CHECK-BE-NEXT: jr $ra
-; CHECK-BE-NEXT: addiu $sp, $sp, 40
+; CHECK-BE-NEXT: or $3, $1, $3
;
; CHECK-LE-LABEL: fshl_i37:
; CHECK-LE: # %bb.0:
-; CHECK-LE-NEXT: addiu $sp, $sp, -40
-; CHECK-LE-NEXT: .cfi_def_cfa_offset 40
-; CHECK-LE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: .cfi_offset 31, -4
-; CHECK-LE-NEXT: .cfi_offset 19, -8
-; CHECK-LE-NEXT: .cfi_offset 18, -12
-; CHECK-LE-NEXT: .cfi_offset 17, -16
-; CHECK-LE-NEXT: .cfi_offset 16, -20
-; CHECK-LE-NEXT: move $16, $7
-; CHECK-LE-NEXT: move $17, $6
-; CHECK-LE-NEXT: move $18, $5
-; CHECK-LE-NEXT: move $19, $4
-; CHECK-LE-NEXT: lw $1, 60($sp)
-; CHECK-LE-NEXT: andi $5, $1, 31
-; CHECK-LE-NEXT: lw $4, 56($sp)
-; CHECK-LE-NEXT: addiu $6, $zero, 37
-; CHECK-LE-NEXT: jal __umoddi3
-; CHECK-LE-NEXT: addiu $7, $zero, 0
-; CHECK-LE-NEXT: srl $1, $2, 5
-; CHECK-LE-NEXT: andi $3, $1, 1
-; CHECK-LE-NEXT: srl $1, $17, 5
-; CHECK-LE-NEXT: sll $4, $16, 27
-; CHECK-LE-NEXT: or $1, $4, $1
-; CHECK-LE-NEXT: move $4, $19
-; CHECK-LE-NEXT: movn $4, $1, $3
-; CHECK-LE-NEXT: sllv $5, $4, $2
-; CHECK-LE-NEXT: not $6, $2
-; CHECK-LE-NEXT: sll $7, $17, 27
-; CHECK-LE-NEXT: movn $1, $7, $3
-; CHECK-LE-NEXT: srl $1, $1, 1
-; CHECK-LE-NEXT: srlv $1, $1, $6
-; CHECK-LE-NEXT: or $1, $5, $1
-; CHECK-LE-NEXT: movn $18, $19, $3
-; CHECK-LE-NEXT: sllv $2, $18, $2
-; CHECK-LE-NEXT: srl $3, $4, 1
-; CHECK-LE-NEXT: srlv $3, $3, $6
-; CHECK-LE-NEXT: or $3, $2, $3
-; CHECK-LE-NEXT: move $2, $1
-; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; CHECK-LE-NEXT: lui $1, 1771
+; CHECK-LE-NEXT: ori $1, $1, 15941
+; CHECK-LE-NEXT: lw $2, 16($sp)
+; CHECK-LE-NEXT: multu $2, $1
+; CHECK-LE-NEXT: mfhi $3
+; CHECK-LE-NEXT: mflo $8
+; CHECK-LE-NEXT: lui $9, 12398
+; CHECK-LE-NEXT: ori $9, $9, 46053
+; CHECK-LE-NEXT: multu $2, $9
+; CHECK-LE-NEXT: mfhi $10
+; CHECK-LE-NEXT: lw $11, 20($sp)
+; CHECK-LE-NEXT: andi $11, $11, 31
+; CHECK-LE-NEXT: multu $11, $9
+; CHECK-LE-NEXT: mflo $9
+; CHECK-LE-NEXT: mfhi $12
+; CHECK-LE-NEXT: addu $8, $10, $8
+; CHECK-LE-NEXT: sltu $10, $8, $10
+; CHECK-LE-NEXT: addu $9, $8, $9
+; CHECK-LE-NEXT: sltu $8, $9, $8
+; CHECK-LE-NEXT: addu $3, $3, $10
+; CHECK-LE-NEXT: srl $9, $6, 5
+; CHECK-LE-NEXT: sll $7, $7, 27
+; CHECK-LE-NEXT: or $7, $7, $9
+; CHECK-LE-NEXT: sll $6, $6, 27
+; CHECK-LE-NEXT: addu $3, $3, $12
+; CHECK-LE-NEXT: addu $3, $3, $8
+; CHECK-LE-NEXT: mul $1, $11, $1
+; CHECK-LE-NEXT: addu $1, $3, $1
+; CHECK-LE-NEXT: sll $3, $1, 2
+; CHECK-LE-NEXT: addu $3, $3, $1
+; CHECK-LE-NEXT: sll $1, $1, 5
+; CHECK-LE-NEXT: addu $1, $1, $3
+; CHECK-LE-NEXT: subu $1, $2, $1
+; CHECK-LE-NEXT: andi $2, $1, 32
+; CHECK-LE-NEXT: srl $3, $2, 5
+; CHECK-LE-NEXT: move $8, $4
+; CHECK-LE-NEXT: movn $8, $7, $3
+; CHECK-LE-NEXT: sllv $2, $8, $1
+; CHECK-LE-NEXT: not $9, $1
+; CHECK-LE-NEXT: movn $7, $6, $3
+; CHECK-LE-NEXT: srl $6, $7, 1
+; CHECK-LE-NEXT: srlv $6, $6, $9
+; CHECK-LE-NEXT: or $2, $2, $6
+; CHECK-LE-NEXT: movn $5, $4, $3
+; CHECK-LE-NEXT: sllv $1, $5, $1
+; CHECK-LE-NEXT: srl $3, $8, 1
+; CHECK-LE-NEXT: srlv $3, $3, $9
; CHECK-LE-NEXT: jr $ra
-; CHECK-LE-NEXT: addiu $sp, $sp, 40
+; CHECK-LE-NEXT: or $3, $1, $3
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
@@ -288,104 +289,106 @@ declare i37 @llvm.fshr.i37(i37, i37, i37)
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-BE-LABEL: fshr_i37:
; CHECK-BE: # %bb.0:
-; CHECK-BE-NEXT: addiu $sp, $sp, -40
-; CHECK-BE-NEXT: .cfi_def_cfa_offset 40
-; CHECK-BE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-BE-NEXT: .cfi_offset 31, -4
-; CHECK-BE-NEXT: .cfi_offset 19, -8
-; CHECK-BE-NEXT: .cfi_offset 18, -12
-; CHECK-BE-NEXT: .cfi_offset 17, -16
-; CHECK-BE-NEXT: .cfi_offset 16, -20
-; CHECK-BE-NEXT: move $16, $7
-; CHECK-BE-NEXT: move $17, $6
-; CHECK-BE-NEXT: move $18, $5
-; CHECK-BE-NEXT: move $19, $4
-; CHECK-BE-NEXT: lw $1, 56($sp)
-; CHECK-BE-NEXT: andi $4, $1, 31
-; CHECK-BE-NEXT: lw $5, 60($sp)
-; CHECK-BE-NEXT: addiu $6, $zero, 0
-; CHECK-BE-NEXT: jal __umoddi3
-; CHECK-BE-NEXT: addiu $7, $zero, 37
-; CHECK-BE-NEXT: addiu $1, $3, 27
+; CHECK-BE-NEXT: lui $1, 1771
+; CHECK-BE-NEXT: ori $1, $1, 15941
+; CHECK-BE-NEXT: lw $2, 20($sp)
+; CHECK-BE-NEXT: multu $2, $1
+; CHECK-BE-NEXT: mfhi $3
+; CHECK-BE-NEXT: mflo $8
+; CHECK-BE-NEXT: lui $9, 12398
+; CHECK-BE-NEXT: ori $9, $9, 46053
+; CHECK-BE-NEXT: multu $2, $9
+; CHECK-BE-NEXT: mfhi $10
+; CHECK-BE-NEXT: lw $11, 16($sp)
+; CHECK-BE-NEXT: andi $11, $11, 31
+; CHECK-BE-NEXT: multu $11, $9
+; CHECK-BE-NEXT: mflo $9
+; CHECK-BE-NEXT: mfhi $12
+; CHECK-BE-NEXT: addu $8, $10, $8
+; CHECK-BE-NEXT: sltu $10, $8, $10
+; CHECK-BE-NEXT: addu $9, $8, $9
+; CHECK-BE-NEXT: sltu $8, $9, $8
+; CHECK-BE-NEXT: addu $3, $3, $10
+; CHECK-BE-NEXT: srl $9, $7, 5
+; CHECK-BE-NEXT: sll $6, $6, 27
+; CHECK-BE-NEXT: or $6, $6, $9
+; CHECK-BE-NEXT: sll $7, $7, 27
+; CHECK-BE-NEXT: addu $3, $3, $12
+; CHECK-BE-NEXT: addu $3, $3, $8
+; CHECK-BE-NEXT: mul $1, $11, $1
+; CHECK-BE-NEXT: addu $1, $3, $1
+; CHECK-BE-NEXT: sll $3, $1, 2
+; CHECK-BE-NEXT: addu $3, $3, $1
+; CHECK-BE-NEXT: sll $1, $1, 5
+; CHECK-BE-NEXT: addu $1, $1, $3
+; CHECK-BE-NEXT: subu $1, $2, $1
+; CHECK-BE-NEXT: addiu $1, $1, 27
; CHECK-BE-NEXT: andi $3, $1, 32
-; CHECK-BE-NEXT: srl $2, $16, 5
-; CHECK-BE-NEXT: sll $4, $17, 27
-; CHECK-BE-NEXT: or $4, $4, $2
-; CHECK-BE-NEXT: movz $19, $18, $3
-; CHECK-BE-NEXT: movz $18, $4, $3
-; CHECK-BE-NEXT: srlv $2, $18, $1
-; CHECK-BE-NEXT: not $5, $1
-; CHECK-BE-NEXT: sll $6, $19, 1
-; CHECK-BE-NEXT: sllv $6, $6, $5
-; CHECK-BE-NEXT: sll $7, $16, 27
-; CHECK-BE-NEXT: or $2, $6, $2
-; CHECK-BE-NEXT: movz $4, $7, $3
-; CHECK-BE-NEXT: srlv $1, $4, $1
-; CHECK-BE-NEXT: sll $3, $18, 1
-; CHECK-BE-NEXT: sllv $3, $3, $5
-; CHECK-BE-NEXT: or $3, $3, $1
-; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload
-; CHECK-BE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; CHECK-BE-NEXT: movz $4, $5, $3
+; CHECK-BE-NEXT: movz $5, $6, $3
+; CHECK-BE-NEXT: srlv $2, $5, $1
+; CHECK-BE-NEXT: not $8, $1
+; CHECK-BE-NEXT: sll $4, $4, 1
+; CHECK-BE-NEXT: sllv $4, $4, $8
+; CHECK-BE-NEXT: or $2, $4, $2
+; CHECK-BE-NEXT: movz $6, $7, $3
+; CHECK-BE-NEXT: srlv $1, $6, $1
+; CHECK-BE-NEXT: sll $3, $5, 1
+; CHECK-BE-NEXT: sllv $3, $3, $8
; CHECK-BE-NEXT: jr $ra
-; CHECK-BE-NEXT: addiu $sp, $sp, 40
+; CHECK-BE-NEXT: or $3, $3, $1
;
; CHECK-LE-LABEL: fshr_i37:
; CHECK-LE: # %bb.0:
-; CHECK-LE-NEXT: addiu $sp, $sp, -40
-; CHECK-LE-NEXT: .cfi_def_cfa_offset 40
-; CHECK-LE-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $19, 32($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $18, 28($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
-; CHECK-LE-NEXT: .cfi_offset 31, -4
-; CHECK-LE-NEXT: .cfi_offset 19, -8
-; CHECK-LE-NEXT: .cfi_offset 18, -12
-; CHECK-LE-NEXT: .cfi_offset 17, -16
-; CHECK-LE-NEXT: .cfi_offset 16, -20
-; CHECK-LE-NEXT: move $16, $7
-; CHECK-LE-NEXT: move $17, $6
-; CHECK-LE-NEXT: move $18, $5
-; CHECK-LE-NEXT: move $19, $4
-; CHECK-LE-NEXT: lw $1, 60($sp)
-; CHECK-LE-NEXT: andi $5, $1, 31
-; CHECK-LE-NEXT: lw $4, 56($sp)
-; CHECK-LE-NEXT: addiu $6, $zero, 37
-; CHECK-LE-NEXT: jal __umoddi3
-; CHECK-LE-NEXT: addiu $7, $zero, 0
-; CHECK-LE-NEXT: addiu $1, $2, 27
+; CHECK-LE-NEXT: lui $1, 1771
+; CHECK-LE-NEXT: ori $1, $1, 15941
+; CHECK-LE-NEXT: lw $2, 16($sp)
+; CHECK-LE-NEXT: multu $2, $1
+; CHECK-LE-NEXT: mfhi $3
+; CHECK-LE-NEXT: mflo $8
+; CHECK-LE-NEXT: lui $9, 12398
+; CHECK-LE-NEXT: ori $9, $9, 46053
+; CHECK-LE-NEXT: multu $2, $9
+; CHECK-LE-NEXT: mfhi $10
+; CHECK-LE-NEXT: lw $11, 20($sp)
+; CHECK-LE-NEXT: andi $11, $11, 31
+; CHECK-LE-NEXT: multu $11, $9
+; CHECK-LE-NEXT: mflo $9
+; CHECK-LE-NEXT: mfhi $12
+; CHECK-LE-NEXT: addu $8, $10, $8
+; CHECK-LE-NEXT: sltu $10, $8, $10
+; CHECK-LE-NEXT: addu $9, $8, $9
+; CHECK-LE-NEXT: sltu $8, $9, $8
+; CHECK-LE-NEXT: addu $3, $3, $10
+; CHECK-LE-NEXT: srl $9, $6, 5
+; CHECK-LE-NEXT: sll $7, $7, 27
+; CHECK-LE-NEXT: or $7, $7, $9
+; CHECK-LE-NEXT: sll $6, $6, 27
+; CHECK-LE-NEXT: addu $3, $3, $12
+; CHECK-LE-NEXT: addu $3, $3, $8
+; CHECK-LE-NEXT: mul $1, $11, $1
+; CHECK-LE-NEXT: addu $1, $3, $1
+; CHECK-LE-NEXT: sll $3, $1, 2
+; CHECK-LE-NEXT: addu $3, $3, $1
+; CHECK-LE-NEXT: sll $1, $1, 5
+; CHECK-LE-NEXT: addu $1, $1, $3
+; CHECK-LE-NEXT: subu $1, $2, $1
+; CHECK-LE-NEXT: addiu $1, $1, 27
; CHECK-LE-NEXT: andi $3, $1, 32
-; CHECK-LE-NEXT: srl $2, $17, 5
-; CHECK-LE-NEXT: sll $4, $16, 27
-; CHECK-LE-NEXT: or $2, $4, $2
-; CHECK-LE-NEXT: sll $4, $17, 27
-; CHECK-LE-NEXT: move $5, $19
-; CHECK-LE-NEXT: movz $5, $2, $3
-; CHECK-LE-NEXT: movz $2, $4, $3
-; CHECK-LE-NEXT: srlv $2, $2, $1
-; CHECK-LE-NEXT: not $4, $1
-; CHECK-LE-NEXT: sll $6, $5, 1
-; CHECK-LE-NEXT: sllv $6, $6, $4
-; CHECK-LE-NEXT: or $2, $6, $2
-; CHECK-LE-NEXT: srlv $1, $5, $1
-; CHECK-LE-NEXT: movz $18, $19, $3
-; CHECK-LE-NEXT: sll $3, $18, 1
-; CHECK-LE-NEXT: sllv $3, $3, $4
-; CHECK-LE-NEXT: or $3, $3, $1
-; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $19, 32($sp) # 4-byte Folded Reload
-; CHECK-LE-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; CHECK-LE-NEXT: move $8, $4
+; CHECK-LE-NEXT: movz $8, $7, $3
+; CHECK-LE-NEXT: movz $7, $6, $3
+; CHECK-LE-NEXT: srlv $2, $7, $1
+; CHECK-LE-NEXT: not $6, $1
+; CHECK-LE-NEXT: sll $7, $8, 1
+; CHECK-LE-NEXT: sllv $7, $7, $6
+; CHECK-LE-NEXT: or $2, $7, $2
+; CHECK-LE-NEXT: srlv $1, $8, $1
+; CHECK-LE-NEXT: movz $5, $4, $3
+; CHECK-LE-NEXT: sll $3, $5, 1
+; CHECK-LE-NEXT: sllv $3, $3, $6
; CHECK-LE-NEXT: jr $ra
-; CHECK-LE-NEXT: addiu $sp, $sp, 40
+; CHECK-LE-NEXT: or $3, $3, $1
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift.ll b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
index be95233656f47..952fede1d9b8d 100644
--- a/llvm/test/CodeGen/PowerPC/funnel-shift.ll
+++ b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
@@ -270,116 +270,94 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; CHECK32_32-LABEL: fshl_i37:
; CHECK32_32: # %bb.0:
-; CHECK32_32-NEXT: mflr 0
-; CHECK32_32-NEXT: stwu 1, -32(1)
-; CHECK32_32-NEXT: stw 0, 36(1)
-; CHECK32_32-NEXT: .cfi_def_cfa_offset 32
-; CHECK32_32-NEXT: .cfi_offset lr, 4
-; CHECK32_32-NEXT: .cfi_offset r27, -20
-; CHECK32_32-NEXT: .cfi_offset r28, -16
-; CHECK32_32-NEXT: .cfi_offset r29, -12
-; CHECK32_32-NEXT: .cfi_offset r30, -8
-; CHECK32_32-NEXT: stw 27, 12(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 27, 5
-; CHECK32_32-NEXT: stw 28, 16(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 28, 3
-; CHECK32_32-NEXT: stw 29, 20(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 29, 4
-; CHECK32_32-NEXT: stw 30, 24(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 30, 6
-; CHECK32_32-NEXT: clrlwi 3, 7, 27
-; CHECK32_32-NEXT: mr 4, 8
-; CHECK32_32-NEXT: li 5, 0
-; CHECK32_32-NEXT: li 6, 37
-; CHECK32_32-NEXT: bl __umoddi3
-; CHECK32_32-NEXT: rotlwi 5, 30, 27
-; CHECK32_32-NEXT: rlwimi 5, 27, 27, 0, 4
-; CHECK32_32-NEXT: andi. 3, 4, 32
-; CHECK32_32-NEXT: mr 6, 5
+; CHECK32_32-NEXT: lis 9, 1771
+; CHECK32_32-NEXT: lis 11, 12398
+; CHECK32_32-NEXT: ori 9, 9, 15941
+; CHECK32_32-NEXT: clrlwi 7, 7, 27
+; CHECK32_32-NEXT: ori 11, 11, 46053
+; CHECK32_32-NEXT: mulhwu 10, 8, 9
+; CHECK32_32-NEXT: mulhwu 12, 7, 11
+; CHECK32_32-NEXT: mullw 0, 8, 9
+; CHECK32_32-NEXT: mullw 9, 7, 9
+; CHECK32_32-NEXT: mullw 7, 7, 11
+; CHECK32_32-NEXT: mulhwu 11, 8, 11
+; CHECK32_32-NEXT: addc 11, 11, 0
+; CHECK32_32-NEXT: addze 10, 10
+; CHECK32_32-NEXT: addc 7, 11, 7
+; CHECK32_32-NEXT: adde 7, 10, 12
+; CHECK32_32-NEXT: add 7, 7, 9
+; CHECK32_32-NEXT: mulli 7, 7, 37
+; CHECK32_32-NEXT: sub 8, 8, 7
+; CHECK32_32-NEXT: andi. 7, 8, 32
+; CHECK32_32-NEXT: rotlwi 7, 6, 27
+; CHECK32_32-NEXT: rlwimi 7, 5, 27, 0, 4
+; CHECK32_32-NEXT: mr 5, 7
; CHECK32_32-NEXT: bne 0, .LBB3_2
; CHECK32_32-NEXT: # %bb.1:
-; CHECK32_32-NEXT: mr 6, 29
+; CHECK32_32-NEXT: mr 5, 4
; CHECK32_32-NEXT: .LBB3_2:
-; CHECK32_32-NEXT: clrlwi 4, 4, 27
-; CHECK32_32-NEXT: subfic 7, 4, 32
-; CHECK32_32-NEXT: srw 3, 6, 7
+; CHECK32_32-NEXT: clrlwi 8, 8, 27
+; CHECK32_32-NEXT: subfic 9, 8, 32
+; CHECK32_32-NEXT: srw 10, 5, 9
; CHECK32_32-NEXT: bne 0, .LBB3_4
; CHECK32_32-NEXT: # %bb.3:
-; CHECK32_32-NEXT: mr 29, 28
+; CHECK32_32-NEXT: mr 4, 3
; CHECK32_32-NEXT: .LBB3_4:
-; CHECK32_32-NEXT: slw 8, 29, 4
-; CHECK32_32-NEXT: or 3, 8, 3
+; CHECK32_32-NEXT: slw 3, 4, 8
+; CHECK32_32-NEXT: or 3, 3, 10
; CHECK32_32-NEXT: beq 0, .LBB3_6
; CHECK32_32-NEXT: # %bb.5:
-; CHECK32_32-NEXT: slwi 5, 30, 27
+; CHECK32_32-NEXT: slwi 7, 6, 27
; CHECK32_32-NEXT: .LBB3_6:
-; CHECK32_32-NEXT: srw 5, 5, 7
-; CHECK32_32-NEXT: slw 4, 6, 4
-; CHECK32_32-NEXT: or 4, 4, 5
-; CHECK32_32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 28, 16(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 27, 12(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 0, 36(1)
-; CHECK32_32-NEXT: addi 1, 1, 32
-; CHECK32_32-NEXT: mtlr 0
+; CHECK32_32-NEXT: srw 4, 7, 9
+; CHECK32_32-NEXT: slw 5, 5, 8
+; CHECK32_32-NEXT: or 4, 5, 4
; CHECK32_32-NEXT: blr
;
; CHECK32_64-LABEL: fshl_i37:
; CHECK32_64: # %bb.0:
-; CHECK32_64-NEXT: mflr 0
-; CHECK32_64-NEXT: stwu 1, -32(1)
-; CHECK32_64-NEXT: stw 0, 36(1)
-; CHECK32_64-NEXT: .cfi_def_cfa_offset 32
-; CHECK32_64-NEXT: .cfi_offset lr, 4
-; CHECK32_64-NEXT: .cfi_offset r27, -20
-; CHECK32_64-NEXT: .cfi_offset r28, -16
-; CHECK32_64-NEXT: .cfi_offset r29, -12
-; CHECK32_64-NEXT: .cfi_offset r30, -8
-; CHECK32_64-NEXT: stw 27, 12(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 27, 5
-; CHECK32_64-NEXT: li 5, 0
-; CHECK32_64-NEXT: stw 28, 16(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 28, 3
-; CHECK32_64-NEXT: clrlwi 3, 7, 27
-; CHECK32_64-NEXT: stw 29, 20(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 29, 4
-; CHECK32_64-NEXT: mr 4, 8
-; CHECK32_64-NEXT: stw 30, 24(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 30, 6
-; CHECK32_64-NEXT: li 6, 37
-; CHECK32_64-NEXT: bl __umoddi3
-; CHECK32_64-NEXT: rotlwi 5, 30, 27
-; CHECK32_64-NEXT: andi. 3, 4, 32
-; CHECK32_64-NEXT: rlwimi 5, 27, 27, 0, 4
-; CHECK32_64-NEXT: mr 6, 5
+; CHECK32_64-NEXT: lis 9, 1771
+; CHECK32_64-NEXT: lis 12, 12398
+; CHECK32_64-NEXT: ori 9, 9, 15941
+; CHECK32_64-NEXT: clrlwi 7, 7, 27
+; CHECK32_64-NEXT: ori 12, 12, 46053
+; CHECK32_64-NEXT: mulhwu 10, 8, 9
+; CHECK32_64-NEXT: mullw 11, 8, 9
+; CHECK32_64-NEXT: mulhwu 0, 7, 12
+; CHECK32_64-NEXT: mullw 9, 7, 9
+; CHECK32_64-NEXT: mullw 7, 7, 12
+; CHECK32_64-NEXT: mulhwu 12, 8, 12
+; CHECK32_64-NEXT: addc 11, 12, 11
+; CHECK32_64-NEXT: addze 10, 10
+; CHECK32_64-NEXT: addc 7, 11, 7
+; CHECK32_64-NEXT: adde 7, 10, 0
+; CHECK32_64-NEXT: add 7, 7, 9
+; CHECK32_64-NEXT: mulli 7, 7, 37
+; CHECK32_64-NEXT: sub 8, 8, 7
+; CHECK32_64-NEXT: andi. 7, 8, 32
+; CHECK32_64-NEXT: rotlwi 7, 6, 27
+; CHECK32_64-NEXT: rlwimi 7, 5, 27, 0, 4
+; CHECK32_64-NEXT: mr 5, 7
; CHECK32_64-NEXT: bne 0, .LBB3_2
; CHECK32_64-NEXT: # %bb.1:
-; CHECK32_64-NEXT: mr 6, 29
+; CHECK32_64-NEXT: mr 5, 4
; CHECK32_64-NEXT: .LBB3_2:
-; CHECK32_64-NEXT: clrlwi 4, 4, 27
-; CHECK32_64-NEXT: subfic 7, 4, 32
-; CHECK32_64-NEXT: srw 3, 6, 7
+; CHECK32_64-NEXT: clrlwi 8, 8, 27
+; CHECK32_64-NEXT: subfic 9, 8, 32
+; CHECK32_64-NEXT: srw 10, 5, 9
; CHECK32_64-NEXT: bne 0, .LBB3_4
; CHECK32_64-NEXT: # %bb.3:
-; CHECK32_64-NEXT: mr 29, 28
+; CHECK32_64-NEXT: mr 4, 3
; CHECK32_64-NEXT: .LBB3_4:
-; CHECK32_64-NEXT: slw 8, 29, 4
-; CHECK32_64-NEXT: or 3, 8, 3
+; CHECK32_64-NEXT: slw 3, 4, 8
+; CHECK32_64-NEXT: or 3, 3, 10
; CHECK32_64-NEXT: beq 0, .LBB3_6
; CHECK32_64-NEXT: # %bb.5:
-; CHECK32_64-NEXT: slwi 5, 30, 27
+; CHECK32_64-NEXT: slwi 7, 6, 27
; CHECK32_64-NEXT: .LBB3_6:
-; CHECK32_64-NEXT: srw 5, 5, 7
-; CHECK32_64-NEXT: slw 4, 6, 4
-; CHECK32_64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: or 4, 4, 5
-; CHECK32_64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 28, 16(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 27, 12(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 0, 36(1)
-; CHECK32_64-NEXT: addi 1, 1, 32
-; CHECK32_64-NEXT: mtlr 0
+; CHECK32_64-NEXT: srw 4, 7, 9
+; CHECK32_64-NEXT: slw 5, 5, 8
+; CHECK32_64-NEXT: or 4, 5, 4
; CHECK32_64-NEXT: blr
;
; CHECK64-LABEL: fshl_i37:
@@ -536,118 +514,96 @@ declare i37 @llvm.fshr.i37(i37, i37, i37)
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; CHECK32_32-LABEL: fshr_i37:
; CHECK32_32: # %bb.0:
-; CHECK32_32-NEXT: mflr 0
-; CHECK32_32-NEXT: stwu 1, -32(1)
-; CHECK32_32-NEXT: stw 0, 36(1)
-; CHECK32_32-NEXT: .cfi_def_cfa_offset 32
-; CHECK32_32-NEXT: .cfi_offset lr, 4
-; CHECK32_32-NEXT: .cfi_offset r27, -20
-; CHECK32_32-NEXT: .cfi_offset r28, -16
-; CHECK32_32-NEXT: .cfi_offset r29, -12
-; CHECK32_32-NEXT: .cfi_offset r30, -8
-; CHECK32_32-NEXT: stw 27, 12(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 27, 5
-; CHECK32_32-NEXT: stw 28, 16(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 28, 3
-; CHECK32_32-NEXT: stw 29, 20(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 29, 4
-; CHECK32_32-NEXT: stw 30, 24(1) # 4-byte Folded Spill
-; CHECK32_32-NEXT: mr 30, 6
-; CHECK32_32-NEXT: clrlwi 3, 7, 27
-; CHECK32_32-NEXT: mr 4, 8
-; CHECK32_32-NEXT: li 5, 0
-; CHECK32_32-NEXT: li 6, 37
-; CHECK32_32-NEXT: bl __umoddi3
-; CHECK32_32-NEXT: rotlwi 5, 30, 27
-; CHECK32_32-NEXT: addi 3, 4, 27
-; CHECK32_32-NEXT: andi. 4, 3, 32
-; CHECK32_32-NEXT: rlwimi 5, 27, 27, 0, 4
-; CHECK32_32-NEXT: mr 4, 5
+; CHECK32_32-NEXT: lis 9, 1771
+; CHECK32_32-NEXT: lis 11, 12398
+; CHECK32_32-NEXT: ori 9, 9, 15941
+; CHECK32_32-NEXT: clrlwi 7, 7, 27
+; CHECK32_32-NEXT: ori 11, 11, 46053
+; CHECK32_32-NEXT: mulhwu 10, 8, 9
+; CHECK32_32-NEXT: mulhwu 12, 7, 11
+; CHECK32_32-NEXT: mullw 0, 8, 9
+; CHECK32_32-NEXT: mullw 9, 7, 9
+; CHECK32_32-NEXT: mullw 7, 7, 11
+; CHECK32_32-NEXT: mulhwu 11, 8, 11
+; CHECK32_32-NEXT: addc 11, 11, 0
+; CHECK32_32-NEXT: addze 10, 10
+; CHECK32_32-NEXT: addc 7, 11, 7
+; CHECK32_32-NEXT: adde 7, 10, 12
+; CHECK32_32-NEXT: add 7, 7, 9
+; CHECK32_32-NEXT: mulli 7, 7, 37
+; CHECK32_32-NEXT: sub 7, 8, 7
+; CHECK32_32-NEXT: addi 8, 7, 27
+; CHECK32_32-NEXT: andi. 7, 8, 32
+; CHECK32_32-NEXT: rotlwi 7, 6, 27
+; CHECK32_32-NEXT: rlwimi 7, 5, 27, 0, 4
+; CHECK32_32-NEXT: mr 5, 7
; CHECK32_32-NEXT: beq 0, .LBB11_2
; CHECK32_32-NEXT: # %bb.1:
-; CHECK32_32-NEXT: mr 4, 29
+; CHECK32_32-NEXT: mr 5, 4
; CHECK32_32-NEXT: .LBB11_2:
-; CHECK32_32-NEXT: clrlwi 6, 3, 27
-; CHECK32_32-NEXT: srw 3, 4, 6
+; CHECK32_32-NEXT: clrlwi 8, 8, 27
+; CHECK32_32-NEXT: srw 10, 5, 8
; CHECK32_32-NEXT: beq 0, .LBB11_4
; CHECK32_32-NEXT: # %bb.3:
-; CHECK32_32-NEXT: mr 29, 28
+; CHECK32_32-NEXT: mr 4, 3
; CHECK32_32-NEXT: .LBB11_4:
-; CHECK32_32-NEXT: subfic 7, 6, 32
-; CHECK32_32-NEXT: slw 8, 29, 7
-; CHECK32_32-NEXT: or 3, 8, 3
+; CHECK32_32-NEXT: subfic 9, 8, 32
+; CHECK32_32-NEXT: slw 3, 4, 9
+; CHECK32_32-NEXT: or 3, 3, 10
; CHECK32_32-NEXT: bne 0, .LBB11_6
; CHECK32_32-NEXT: # %bb.5:
-; CHECK32_32-NEXT: slwi 5, 30, 27
+; CHECK32_32-NEXT: slwi 7, 6, 27
; CHECK32_32-NEXT: .LBB11_6:
-; CHECK32_32-NEXT: srw 5, 5, 6
-; CHECK32_32-NEXT: slw 4, 4, 7
-; CHECK32_32-NEXT: or 4, 4, 5
-; CHECK32_32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 28, 16(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 27, 12(1) # 4-byte Folded Reload
-; CHECK32_32-NEXT: lwz 0, 36(1)
-; CHECK32_32-NEXT: addi 1, 1, 32
-; CHECK32_32-NEXT: mtlr 0
+; CHECK32_32-NEXT: srw 4, 7, 8
+; CHECK32_32-NEXT: slw 5, 5, 9
+; CHECK32_32-NEXT: or 4, 5, 4
; CHECK32_32-NEXT: blr
;
; CHECK32_64-LABEL: fshr_i37:
; CHECK32_64: # %bb.0:
-; CHECK32_64-NEXT: mflr 0
-; CHECK32_64-NEXT: stwu 1, -32(1)
-; CHECK32_64-NEXT: stw 0, 36(1)
-; CHECK32_64-NEXT: .cfi_def_cfa_offset 32
-; CHECK32_64-NEXT: .cfi_offset lr, 4
-; CHECK32_64-NEXT: .cfi_offset r27, -20
-; CHECK32_64-NEXT: .cfi_offset r28, -16
-; CHECK32_64-NEXT: .cfi_offset r29, -12
-; CHECK32_64-NEXT: .cfi_offset r30, -8
-; CHECK32_64-NEXT: stw 27, 12(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 27, 5
-; CHECK32_64-NEXT: li 5, 0
-; CHECK32_64-NEXT: stw 28, 16(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 28, 3
-; CHECK32_64-NEXT: clrlwi 3, 7, 27
-; CHECK32_64-NEXT: stw 29, 20(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 29, 4
-; CHECK32_64-NEXT: mr 4, 8
-; CHECK32_64-NEXT: stw 30, 24(1) # 4-byte Folded Spill
-; CHECK32_64-NEXT: mr 30, 6
-; CHECK32_64-NEXT: li 6, 37
-; CHECK32_64-NEXT: bl __umoddi3
-; CHECK32_64-NEXT: rotlwi 5, 30, 27
-; CHECK32_64-NEXT: addi 3, 4, 27
-; CHECK32_64-NEXT: andi. 4, 3, 32
-; CHECK32_64-NEXT: rlwimi 5, 27, 27, 0, 4
-; CHECK32_64-NEXT: mr 4, 5
+; CHECK32_64-NEXT: lis 9, 1771
+; CHECK32_64-NEXT: lis 12, 12398
+; CHECK32_64-NEXT: ori 9, 9, 15941
+; CHECK32_64-NEXT: clrlwi 7, 7, 27
+; CHECK32_64-NEXT: ori 12, 12, 46053
+; CHECK32_64-NEXT: mulhwu 10, 8, 9
+; CHECK32_64-NEXT: mullw 11, 8, 9
+; CHECK32_64-NEXT: mulhwu 0, 7, 12
+; CHECK32_64-NEXT: mullw 9, 7, 9
+; CHECK32_64-NEXT: mullw 7, 7, 12
+; CHECK32_64-NEXT: mulhwu 12, 8, 12
+; CHECK32_64-NEXT: addc 11, 12, 11
+; CHECK32_64-NEXT: addze 10, 10
+; CHECK32_64-NEXT: addc 7, 11, 7
+; CHECK32_64-NEXT: adde 7, 10, 0
+; CHECK32_64-NEXT: add 7, 7, 9
+; CHECK32_64-NEXT: mulli 7, 7, 37
+; CHECK32_64-NEXT: sub 7, 8, 7
+; CHECK32_64-NEXT: addi 8, 7, 27
+; CHECK32_64-NEXT: andi. 7, 8, 32
+; CHECK32_64-NEXT: rotlwi 7, 6, 27
+; CHECK32_64-NEXT: rlwimi 7, 5, 27, 0, 4
+; CHECK32_64-NEXT: mr 5, 7
; CHECK32_64-NEXT: beq 0, .LBB11_2
; CHECK32_64-NEXT: # %bb.1:
-; CHECK32_64-NEXT: mr 4, 29
+; CHECK32_64-NEXT: mr 5, 4
; CHECK32_64-NEXT: .LBB11_2:
-; CHECK32_64-NEXT: clrlwi 6, 3, 27
-; CHECK32_64-NEXT: srw 3, 4, 6
+; CHECK32_64-NEXT: clrlwi 8, 8, 27
+; CHECK32_64-NEXT: srw 10, 5, 8
; CHECK32_64-NEXT: beq 0, .LBB11_4
; CHECK32_64-NEXT: # %bb.3:
-; CHECK32_64-NEXT: mr 29, 28
+; CHECK32_64-NEXT: mr 4, 3
; CHECK32_64-NEXT: .LBB11_4:
-; CHECK32_64-NEXT: subfic 7, 6, 32
-; CHECK32_64-NEXT: slw 8, 29, 7
-; CHECK32_64-NEXT: or 3, 8, 3
+; CHECK32_64-NEXT: subfic 9, 8, 32
+; CHECK32_64-NEXT: slw 3, 4, 9
+; CHECK32_64-NEXT: or 3, 3, 10
; CHECK32_64-NEXT: bne 0, .LBB11_6
; CHECK32_64-NEXT: # %bb.5:
-; CHECK32_64-NEXT: slwi 5, 30, 27
+; CHECK32_64-NEXT: slwi 7, 6, 27
; CHECK32_64-NEXT: .LBB11_6:
-; CHECK32_64-NEXT: srw 5, 5, 6
-; CHECK32_64-NEXT: slw 4, 4, 7
-; CHECK32_64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: or 4, 4, 5
-; CHECK32_64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 28, 16(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 27, 12(1) # 4-byte Folded Reload
-; CHECK32_64-NEXT: lwz 0, 36(1)
-; CHECK32_64-NEXT: addi 1, 1, 32
-; CHECK32_64-NEXT: mtlr 0
+; CHECK32_64-NEXT: srw 4, 7, 8
+; CHECK32_64-NEXT: slw 5, 5, 9
+; CHECK32_64-NEXT: or 4, 5, 4
; CHECK32_64-NEXT: blr
;
; CHECK64-LABEL: fshr_i37:
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index ce3e2c1a4f7be..bb28336c15c3b 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -851,78 +851,142 @@ define <4 x i64> @fold_urem_i64(<4 x i64> %x) nounwind {
;
; RV32IM-LABEL: fold_urem_i64:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: addi sp, sp, -48
-; RV32IM-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: sw s4, 24(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: sw s5, 20(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: addi sp, sp, -32
+; RV32IM-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mv a2, a1
; RV32IM-NEXT: mv s0, a0
-; RV32IM-NEXT: lw a2, 16(a1)
-; RV32IM-NEXT: lw a4, 20(a1)
-; RV32IM-NEXT: lw s1, 24(a1)
-; RV32IM-NEXT: lw s2, 28(a1)
+; RV32IM-NEXT: lw a7, 16(a1)
+; RV32IM-NEXT: lw a6, 20(a1)
+; RV32IM-NEXT: lw a3, 24(a1)
+; RV32IM-NEXT: lw a5, 28(a1)
; RV32IM-NEXT: lw a0, 0(a1)
-; RV32IM-NEXT: lw a3, 4(a1)
-; RV32IM-NEXT: lw s3, 8(a1)
-; RV32IM-NEXT: lw s4, 12(a1)
-; RV32IM-NEXT: lui a1, 1024
-; RV32IM-NEXT: slli a5, a4, 10
-; RV32IM-NEXT: srli a6, a2, 22
-; RV32IM-NEXT: or a5, a6, a5
-; RV32IM-NEXT: lui a6, 45590
-; RV32IM-NEXT: addi a1, a1, -1
-; RV32IM-NEXT: addi a6, a6, 1069
-; RV32IM-NEXT: and a2, a2, a1
-; RV32IM-NEXT: srli a4, a4, 12
-; RV32IM-NEXT: add a2, a2, a4
-; RV32IM-NEXT: and a1, a5, a1
-; RV32IM-NEXT: add a1, a2, a1
-; RV32IM-NEXT: mulhu a2, a1, a6
-; RV32IM-NEXT: li a4, 23
-; RV32IM-NEXT: mul a2, a2, a4
-; RV32IM-NEXT: sub s7, a1, a2
+; RV32IM-NEXT: lw a1, 4(a1)
+; RV32IM-NEXT: lw a4, 8(a2)
+; RV32IM-NEXT: lw a2, 12(a2)
+; RV32IM-NEXT: lui t0, 410452
+; RV32IM-NEXT: lui t1, 25653
+; RV32IM-NEXT: lui t2, 791991
+; RV32IM-NEXT: lui t3, 834723
+; RV32IM-NEXT: lui t4, 1024
+; RV32IM-NEXT: addi t0, t0, -952
+; RV32IM-NEXT: addi t1, t1, 965
+; RV32IM-NEXT: addi t2, t2, 77
+; RV32IM-NEXT: addi t3, t3, -179
+; RV32IM-NEXT: addi t4, t4, -1
+; RV32IM-NEXT: srli t5, a4, 1
+; RV32IM-NEXT: slli t6, a2, 31
+; RV32IM-NEXT: srli s1, a2, 1
+; RV32IM-NEXT: mul s2, a3, t2
+; RV32IM-NEXT: and s3, a7, t4
+; RV32IM-NEXT: slli s4, a6, 10
+; RV32IM-NEXT: srli a7, a7, 22
+; RV32IM-NEXT: srli a6, a6, 12
+; RV32IM-NEXT: or t5, t6, t5
+; RV32IM-NEXT: mul t6, s1, t1
+; RV32IM-NEXT: mulhu s5, s1, t1
+; RV32IM-NEXT: or a7, a7, s4
+; RV32IM-NEXT: mul s4, s1, t0
+; RV32IM-NEXT: mulhu s1, s1, t0
+; RV32IM-NEXT: add a6, s3, a6
+; RV32IM-NEXT: mul s3, t5, t0
+; RV32IM-NEXT: mulhu t1, t5, t1
+; RV32IM-NEXT: mulhu t0, t5, t0
+; RV32IM-NEXT: mulhu t5, a3, t3
+; RV32IM-NEXT: and a7, a7, t4
+; RV32IM-NEXT: mul t4, a5, t3
+; RV32IM-NEXT: add s2, t5, s2
+; RV32IM-NEXT: add t4, s2, t4
+; RV32IM-NEXT: sltu t5, s2, t5
+; RV32IM-NEXT: sltu t4, t4, s2
+; RV32IM-NEXT: mulhu s2, a3, t2
+; RV32IM-NEXT: add t5, s2, t5
+; RV32IM-NEXT: add a6, a6, a7
+; RV32IM-NEXT: add s3, t1, s3
+; RV32IM-NEXT: add t6, s3, t6
+; RV32IM-NEXT: sltu a7, s3, t1
+; RV32IM-NEXT: sltu t1, t6, s3
+; RV32IM-NEXT: lui t6, 45590
+; RV32IM-NEXT: add a7, t0, a7
+; RV32IM-NEXT: li t0, 23
+; RV32IM-NEXT: addi t6, t6, 1069
+; RV32IM-NEXT: mulhu t3, a5, t3
+; RV32IM-NEXT: add t3, t5, t3
+; RV32IM-NEXT: mulhu t6, a6, t6
+; RV32IM-NEXT: sltu t5, t3, t5
+; RV32IM-NEXT: add t3, t3, t4
+; RV32IM-NEXT: mul t0, t6, t0
+; RV32IM-NEXT: seqz t6, t3
+; RV32IM-NEXT: and t4, t6, t4
+; RV32IM-NEXT: or t4, t5, t4
+; RV32IM-NEXT: mul t5, a5, t2
+; RV32IM-NEXT: mulhu t2, a5, t2
+; RV32IM-NEXT: add s5, a7, s5
+; RV32IM-NEXT: add t5, t3, t5
+; RV32IM-NEXT: sltu a7, s5, a7
+; RV32IM-NEXT: add s5, s5, t1
+; RV32IM-NEXT: sltu t3, t5, t3
+; RV32IM-NEXT: add t2, t3, t2
+; RV32IM-NEXT: seqz t3, s5
+; RV32IM-NEXT: and t1, t3, t1
+; RV32IM-NEXT: add t2, t2, t4
+; RV32IM-NEXT: or a7, a7, t1
+; RV32IM-NEXT: li t1, 654
+; RV32IM-NEXT: add s4, s5, s4
+; RV32IM-NEXT: sltu t3, s4, s5
+; RV32IM-NEXT: add t3, t3, s1
+; RV32IM-NEXT: lui t4, 1
+; RV32IM-NEXT: addi t4, t4, 1327
+; RV32IM-NEXT: srli t5, t5, 12
+; RV32IM-NEXT: srli t6, s4, 7
+; RV32IM-NEXT: add a7, t3, a7
+; RV32IM-NEXT: srli t3, t2, 12
+; RV32IM-NEXT: slli t2, t2, 20
+; RV32IM-NEXT: mul t3, t3, t4
+; RV32IM-NEXT: or t2, t2, t5
+; RV32IM-NEXT: srli t5, a7, 7
+; RV32IM-NEXT: slli a7, a7, 25
+; RV32IM-NEXT: sub a5, a5, t3
+; RV32IM-NEXT: mulhu t3, t2, t4
+; RV32IM-NEXT: mul t2, t2, t4
+; RV32IM-NEXT: mul t4, t5, t1
+; RV32IM-NEXT: or a7, a7, t6
+; RV32IM-NEXT: sub a5, a5, t3
+; RV32IM-NEXT: sub s1, a3, t2
+; RV32IM-NEXT: mulhu t2, a7, t1
+; RV32IM-NEXT: sub a2, a2, t4
+; RV32IM-NEXT: mul a7, a7, t1
+; RV32IM-NEXT: sltu a3, a3, s1
+; RV32IM-NEXT: sub a2, a2, t2
+; RV32IM-NEXT: sub s2, a4, a7
+; RV32IM-NEXT: sub s3, a5, a3
+; RV32IM-NEXT: sltu a3, a4, s2
+; RV32IM-NEXT: sub s4, a2, a3
+; RV32IM-NEXT: sub s5, a6, t0
; RV32IM-NEXT: li a2, 1
-; RV32IM-NEXT: mv a1, a3
-; RV32IM-NEXT: li a3, 0
-; RV32IM-NEXT: call __umoddi3
-; RV32IM-NEXT: mv s5, a0
-; RV32IM-NEXT: mv s6, a1
-; RV32IM-NEXT: li a2, 654
-; RV32IM-NEXT: mv a0, s3
-; RV32IM-NEXT: mv a1, s4
-; RV32IM-NEXT: li a3, 0
-; RV32IM-NEXT: call __umoddi3
-; RV32IM-NEXT: mv s3, a0
-; RV32IM-NEXT: mv s4, a1
-; RV32IM-NEXT: lui a2, 1
-; RV32IM-NEXT: addi a2, a2, 1327
-; RV32IM-NEXT: mv a0, s1
-; RV32IM-NEXT: mv a1, s2
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
-; RV32IM-NEXT: sw s7, 16(s0)
+; RV32IM-NEXT: sw s5, 16(s0)
; RV32IM-NEXT: sw zero, 20(s0)
-; RV32IM-NEXT: sw a0, 24(s0)
-; RV32IM-NEXT: sw a1, 28(s0)
-; RV32IM-NEXT: sw s5, 0(s0)
-; RV32IM-NEXT: sw s6, 4(s0)
-; RV32IM-NEXT: sw s3, 8(s0)
+; RV32IM-NEXT: sw s1, 24(s0)
+; RV32IM-NEXT: sw s3, 28(s0)
+; RV32IM-NEXT: sw a0, 0(s0)
+; RV32IM-NEXT: sw a1, 4(s0)
+; RV32IM-NEXT: sw s2, 8(s0)
; RV32IM-NEXT: sw s4, 12(s0)
-; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
-; RV32IM-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
-; RV32IM-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
-; RV32IM-NEXT: lw s4, 24(sp) # 4-byte Folded Reload
-; RV32IM-NEXT: lw s5, 20(sp) # 4-byte Folded Reload
-; RV32IM-NEXT: lw s6, 16(sp) # 4-byte Folded Reload
-; RV32IM-NEXT: lw s7, 12(sp) # 4-byte Folded Reload
-; RV32IM-NEXT: addi sp, sp, 48
+; RV32IM-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s5, 4(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: addi sp, sp, 32
; RV32IM-NEXT: ret
;
; RV64I-LABEL: fold_urem_i64:
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index b0ecae63b9bfb..2bce7f751e46f 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -294,25 +294,55 @@ entry:
define i64 @PR23590(i64 %x) nounwind {
; X86-LABEL: PR23590:
; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %eax
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $12345 # imm = 0x3039
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1425045447, %edx # imm = 0x54F077C7
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl $417841695, %edx # imm = 0x18E7C21F
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl $1425045447, %edx # imm = 0x54F077C7
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl $417841695, %edx # imm = 0x18E7C21F
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: shrdl $12, %ebx, %edi
+; X86-NEXT: movl $12345, %edx # imm = 0x3039
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: shrl $12, %ebx
+; X86-NEXT: imull $12345, %ebx, %edi # imm = 0x3039
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: subl %eax, %esi
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shrdl $30, %ecx, %eax
; X86-NEXT: andl $1073741823, %eax # imm = 0x3FFFFFFF
; X86-NEXT: movl %esi, %edx
-; X86-NEXT: shrdl $30, %ecx, %edx
; X86-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: shrl $28, %edi
-; X86-NEXT: addl %eax, %edi
; X86-NEXT: addl %edx, %edi
+; X86-NEXT: addl %eax, %edi
; X86-NEXT: movl $613566757, %edx # imm = 0x24924925
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %edx
@@ -336,6 +366,8 @@ define i64 @PR23590(i64 %x) nounwind {
; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-FAST-LABEL: PR23590:
@@ -376,27 +408,43 @@ define { i64, i32 } @PR38622(i64) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $-294967296 # imm = 0xEE6B2800
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $-294967296 # imm = 0xEE6B2800
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: shrdl $11, %edi, %ebx
+; X86-NEXT: movl $1125899, %edx # imm = 0x112E0B
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl $-400107883, %edx # imm = 0xE826D695
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: shrl $11, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl $1125899, %edx # imm = 0x112E0B
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl $-400107883, %edx # imm = 0xE826D695
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %edx, %esi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: shrdl $9, %ebx, %esi
+; X86-NEXT: imull $-294967296, %esi, %eax # imm = 0xEE6B2800
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: shrl $9, %ebx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -1194,13 +1242,41 @@ entry:
define i64 @udiv_i64_magic_large_postshift(i64 %x) nounwind {
; X86-LABEL: udiv_i64_magic_large_postshift:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl $-1431655766, %ecx # imm = 0xAAAAAAAA
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl $-1431655765, %esi # imm = 0xAAAAAAAB
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $-1431655766, %edx # imm = 0xAAAAAAAA
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: shrl $31, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_magic_large_postshift:
@@ -1219,13 +1295,44 @@ define i64 @udiv_i64_magic_large_postshift(i64 %x) nounwind {
define i64 @urem_i64_magic_large_postshift(i64 %x) nounwind {
; X86-LABEL: urem_i64_magic_large_postshift:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $-1073741824 # imm = 0xC0000000
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl $-1431655766, %ebx # imm = 0xAAAAAAAA
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: shrl %ebx
+; X86-NEXT: andl $1073741824, %ebx # imm = 0x40000000
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: urem_i64_magic_large_postshift:
@@ -1246,13 +1353,24 @@ define i64 @urem_i64_magic_large_postshift(i64 %x) nounwind {
define i64 @udiv_i64_magic_large_preshift(i64 %x) nounwind {
; X86-LABEL: udiv_i64_magic_large_preshift:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $14
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shrl %ecx
+; X86-NEXT: movl $613566756, %edx # imm = 0x24924924
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl $-1840700269, %edx # imm = 0x92492493
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_magic_large_preshift:
@@ -1271,13 +1389,37 @@ define i64 @udiv_i64_magic_large_preshift(i64 %x) nounwind {
define i64 @urem_i64_magic_large_preshift(i64 %x) nounwind {
; X86-LABEL: urem_i64_magic_large_preshift:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $14
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %ebx
+; X86-NEXT: shrl %ebx
+; X86-NEXT: movl $613566756, %edx # imm = 0x24924924
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl $-1840700269, %edx # imm = 0x92492493
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: shll $4, %eax
+; X86-NEXT: subl %eax, %esi
+; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: urem_i64_magic_large_preshift:
@@ -1299,13 +1441,51 @@ define i64 @urem_i64_magic_large_preshift(i64 %x) nounwind {
define i64 @udiv_i64_magic_is_add(i64 %x) nounwind {
; X86-LABEL: udiv_i64_magic_is_add:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $196608 # imm = 0x30000
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1431626638, %edx # imm = 0x5554E38E
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl $1431626638, %edx # imm = 0x5554E38E
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: shrdl $1, %ecx, %esi
+; X86-NEXT: shrl %ecx
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: shrl $17, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_magic_is_add:
@@ -1326,13 +1506,61 @@ define i64 @udiv_i64_magic_is_add(i64 %x) nounwind {
define i64 @urem_i64_magic_is_add(i64 %x) nounwind {
; X86-LABEL: urem_i64_magic_is_add:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $196608 # imm = 0x30000
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl $1431626638, %edx # imm = 0x5554E38E
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl $1431626638, %edx # imm = 0x5554E38E
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl $1590754519, %edx # imm = 0x5ED0FCD7
+; X86-NEXT: mull %edx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: sbbl %edi, %ebx
+; X86-NEXT: shrdl $1, %ebx, %eax
+; X86-NEXT: shrl %ebx
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: shrl $17, %ebx
+; X86-NEXT: movl $-1, %edx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: shll $16, %ebx
+; X86-NEXT: leal (%ebx,%ebx,2), %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: sbbl %edi, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: urem_i64_magic_is_add:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 34d29829f5b35..554dd5dbe10ab 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -1173,27 +1173,66 @@ entry:
define i128 @udiv_magic_preshift_and_postshift(i128 %x) nounwind {
; X86-64-LABEL: udiv_magic_preshift_and_postshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movl $202, %edx
-; X86-64-NEXT: xorl %ecx, %ecx
-; X86-64-NEXT: callq __udivti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movabsq $-6757718126012409998, %r9 # imm = 0xA237C32B16CFD772
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: movq %rax, %r8
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movabsq $1095846182596607027, %r10 # imm = 0xF353A4C0A237C33
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: movq %rdx, %rdi
+; X86-64-NEXT: addq %r8, %rdi
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: movq %rdx, %r8
+; X86-64-NEXT: movq %rax, %r9
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: addq %rdi, %rax
+; X86-64-NEXT: adcq %rdx, %rcx
+; X86-64-NEXT: adcq $0, %r8
+; X86-64-NEXT: addq %r9, %rcx
+; X86-64-NEXT: adcq $0, %r8
+; X86-64-NEXT: shrdq $7, %r8, %rcx
+; X86-64-NEXT: shrq $7, %r8
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: movq %r8, %rdx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_magic_preshift_and_postshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $202, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __udivti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq %rcx, %r9
+; WIN64-NEXT: movabsq $-6757718126012409998, %r11 # imm = 0xA237C32B16CFD772
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %rdx, %rcx
+; WIN64-NEXT: movabsq $1095846182596607027, %rsi # imm = 0xF353A4C0A237C33
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: addq %r10, %r9
+; WIN64-NEXT: adcq $0, %rcx
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: movq %rdx, %r10
+; WIN64-NEXT: movq %rax, %r11
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: addq %r9, %rax
+; WIN64-NEXT: adcq %rdx, %rcx
+; WIN64-NEXT: adcq $0, %r10
+; WIN64-NEXT: addq %r11, %rcx
+; WIN64-NEXT: adcq $0, %r10
+; WIN64-NEXT: shrdq $7, %r10, %rcx
+; WIN64-NEXT: shrq $7, %r10
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r10, %rdx
+; WIN64-NEXT: popq %rsi
; WIN64-NEXT: retq
%ret = udiv i128 %x, 202
ret i128 %ret
@@ -1203,27 +1242,81 @@ define i128 @udiv_magic_preshift_and_postshift(i128 %x) nounwind {
define i128 @urem_magic_preshift_and_postshift(i128 %x) nounwind {
; X86-64-LABEL: urem_magic_preshift_and_postshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movabsq $-6757718126012409998, %r10 # imm = 0xA237C32B16CFD772
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: movq %rax, %r8
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movabsq $1095846182596607027, %r11 # imm = 0xF353A4C0A237C33
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: movq %rdx, %r9
+; X86-64-NEXT: addq %r8, %r9
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: movq %rdx, %r8
+; X86-64-NEXT: movq %rax, %r10
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: addq %r9, %rax
+; X86-64-NEXT: adcq %rdx, %rcx
+; X86-64-NEXT: adcq $0, %r8
+; X86-64-NEXT: addq %r10, %rcx
+; X86-64-NEXT: adcq $0, %r8
+; X86-64-NEXT: shrdq $7, %r8, %rcx
; X86-64-NEXT: movl $202, %edx
-; X86-64-NEXT: xorl %ecx, %ecx
-; X86-64-NEXT: callq __umodti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: mulq %rdx
+; X86-64-NEXT: shrq $7, %r8
+; X86-64-NEXT: imulq $202, %r8, %rcx
+; X86-64-NEXT: addq %rdx, %rcx
+; X86-64-NEXT: subq %rax, %rdi
+; X86-64-NEXT: sbbq %rcx, %rsi
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: movq %rsi, %rdx
; X86-64-NEXT: retq
;
; WIN64-LABEL: urem_magic_preshift_and_postshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $202, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __umodti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movabsq $-6757718126012409998, %rsi # imm = 0xA237C32B16CFD772
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movabsq $1095846182596607027, %rdi # imm = 0xF353A4C0A237C33
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: movq %rdx, %r11
+; WIN64-NEXT: addq %r10, %r11
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: movq %rdx, %r10
+; WIN64-NEXT: movq %rax, %rsi
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: addq %r11, %rax
+; WIN64-NEXT: adcq %rdx, %r9
+; WIN64-NEXT: adcq $0, %r10
+; WIN64-NEXT: addq %rsi, %r9
+; WIN64-NEXT: adcq $0, %r10
+; WIN64-NEXT: shrdq $7, %r10, %r9
+; WIN64-NEXT: movl $202, %edx
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: shrq $7, %r10
+; WIN64-NEXT: imulq $202, %r10, %r9
+; WIN64-NEXT: addq %rdx, %r9
+; WIN64-NEXT: subq %rax, %rcx
+; WIN64-NEXT: sbbq %r9, %r8
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r8, %rdx
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
; WIN64-NEXT: retq
%ret = urem i128 %x, 202
ret i128 %ret
@@ -1233,28 +1326,37 @@ define i128 @urem_magic_preshift_and_postshift(i128 %x) nounwind {
define i128 @udiv_magic_large_preshift(i128 %x) nounwind {
; X86-64-LABEL: udiv_magic_large_preshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000
+; X86-64-NEXT: shrq $36, %rsi
+; X86-64-NEXT: movabsq $1676976733973595601, %rcx # imm = 0x1745D1745D1745D1
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %rcx
+; X86-64-NEXT: movq %rax, %rcx
+; X86-64-NEXT: movq %rdx, %rdi
+; X86-64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %rdx
+; X86-64-NEXT: addq %rcx, %rdx
+; X86-64-NEXT: adcq $0, %rdi
+; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: xorl %edx, %edx
-; X86-64-NEXT: callq __udivti3 at PLT
-; X86-64-NEXT: popq %rcx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_magic_large_preshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000
-; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __udivti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: movq %rdx, %rcx
+; WIN64-NEXT: shrq $36, %rcx
+; WIN64-NEXT: movabsq $1676976733973595601, %rdx # imm = 0x1745D1745D1745D1
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: movq %rax, %r8
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: addq %r8, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
%ret = udiv i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100
ret i128 %ret
@@ -1264,28 +1366,45 @@ define i128 @udiv_magic_large_preshift(i128 %x) nounwind {
define i128 @urem_magic_large_preshift(i128 %x) nounwind {
; X86-64-LABEL: urem_magic_large_preshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movabsq $755914244096, %rcx # imm = 0xB000000000
-; X86-64-NEXT: xorl %edx, %edx
-; X86-64-NEXT: callq __umodti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq %rsi, %rcx
+; X86-64-NEXT: shrq $36, %rcx
+; X86-64-NEXT: movabsq $1676976733973595601, %rdx # imm = 0x1745D1745D1745D1
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: mulq %rdx
+; X86-64-NEXT: movq %rax, %r8
+; X86-64-NEXT: movq %rdx, %r9
+; X86-64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: mulq %rdx
+; X86-64-NEXT: addq %r8, %rdx
+; X86-64-NEXT: adcq $0, %r9
+; X86-64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000
+; X86-64-NEXT: imulq %r9, %rax
+; X86-64-NEXT: subq %rax, %rsi
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: movq %rsi, %rdx
; X86-64-NEXT: retq
;
; WIN64-LABEL: urem_magic_large_preshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: shrq $36, %r9
+; WIN64-NEXT: movabsq $1676976733973595601, %rdx # imm = 0x1745D1745D1745D1
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %rdx, %r11
+; WIN64-NEXT: movabsq $8384883669867978008, %rdx # imm = 0x745D1745D1745D18
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: addq %r10, %rdx
+; WIN64-NEXT: adcq $0, %r11
; WIN64-NEXT: movabsq $755914244096, %rax # imm = 0xB000000000
-; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __umodti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: imulq %r11, %rax
+; WIN64-NEXT: subq %rax, %r8
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r8, %rdx
; WIN64-NEXT: retq
%ret = urem i128 %x, 13944156602510523416463735259136 ; = 11 * 2^100
ret i128 %ret
@@ -1295,27 +1414,39 @@ define i128 @urem_magic_large_preshift(i128 %x) nounwind {
define i128 @udiv_magic_large_postshift(i128 %x) nounwind {
; X86-64-LABEL: udiv_magic_large_postshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movl $1, %edx
-; X86-64-NEXT: movl $1, %ecx
-; X86-64-NEXT: callq __udivti3@PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq $-1, %r9
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movq %rax, %r8
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: addq %rsi, %rax
+; X86-64-NEXT: adcq $0, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: addq %r8, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_magic_large_postshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __udivti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq $-1, %r11
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: addq %r8, %rax
+; WIN64-NEXT: adcq $0, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: addq %r10, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: movq %r9, %rax
+; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
%ret = udiv i128 %x, 18446744073709551617 ; = 2^64 + 1
ret i128 %ret
@@ -1325,27 +1456,43 @@ define i128 @udiv_magic_large_postshift(i128 %x) nounwind {
define i128 @urem_magic_large_postshift(i128 %x) nounwind {
; X86-64-LABEL: urem_magic_large_postshift:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movl $1, %edx
-; X86-64-NEXT: movl $1, %ecx
-; X86-64-NEXT: callq __umodti3@PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq $-1, %r9
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movq %rax, %r8
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r9
+; X86-64-NEXT: addq %rsi, %rax
+; X86-64-NEXT: adcq $0, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: addq %r8, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: subq %rcx, %rdi
+; X86-64-NEXT: sbbq %rcx, %rsi
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: movq %rsi, %rdx
; X86-64-NEXT: retq
;
; WIN64-LABEL: urem_magic_large_postshift:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __umodti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq $-1, %r11
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %r11
+; WIN64-NEXT: addq %r8, %rax
+; WIN64-NEXT: adcq $0, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: addq %r10, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: subq %r9, %rcx
+; WIN64-NEXT: sbbq %r9, %r8
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r8, %rdx
; WIN64-NEXT: retq
%ret = urem i128 %x, 18446744073709551617 ; = 2^64 + 1
ret i128 %ret
@@ -1355,28 +1502,77 @@ define i128 @urem_magic_large_postshift(i128 %x) nounwind {
define i128 @udiv_magic_is_add(i128 %x) nounwind {
; X86-64-LABEL: udiv_magic_is_add:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X86-64-NEXT: movl $1, %edx
-; X86-64-NEXT: callq __udivti3@PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq $-1, %r10
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: movq %rax, %rcx
+; X86-64-NEXT: movq %rdx, %r8
+; X86-64-NEXT: movq $-3, %r11
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: movq %rdx, %r9
+; X86-64-NEXT: addq %rcx, %r9
+; X86-64-NEXT: adcq $0, %r8
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movq %rax, %r10
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: addq %r9, %rax
+; X86-64-NEXT: adcq %r8, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: addq %r10, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: subq %rdx, %rdi
+; X86-64-NEXT: sbbq %rcx, %rsi
+; X86-64-NEXT: shrdq $1, %rsi, %rdi
+; X86-64-NEXT: shrq %rsi
+; X86-64-NEXT: addq %rdx, %rdi
+; X86-64-NEXT: adcq %rsi, %rcx
+; X86-64-NEXT: shrq $63, %rcx
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_magic_is_add:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __udivti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq $-1, %rsi
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: movq %rax, %r9
+; WIN64-NEXT: movq %rdx, %r10
+; WIN64-NEXT: movq $-3, %rdi
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: movq %rdx, %r11
+; WIN64-NEXT: addq %r9, %r11
+; WIN64-NEXT: adcq $0, %r10
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movq %rax, %rsi
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: addq %r11, %rax
+; WIN64-NEXT: adcq %r10, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: addq %rsi, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: subq %rdx, %rcx
+; WIN64-NEXT: sbbq %r9, %r8
+; WIN64-NEXT: shrdq $1, %r8, %rcx
+; WIN64-NEXT: shrq %r8
+; WIN64-NEXT: addq %rdx, %rcx
+; WIN64-NEXT: adcq %r9, %r8
+; WIN64-NEXT: shrq $63, %r8
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: xorl %edx, %edx
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
; WIN64-NEXT: retq
%ret = udiv i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1
ret i128 %ret
@@ -1386,28 +1582,89 @@ define i128 @udiv_magic_is_add(i128 %x) nounwind {
define i128 @urem_magic_is_add(i128 %x) nounwind {
; X86-64-LABEL: urem_magic_is_add:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; X86-64-NEXT: movl $1, %edx
-; X86-64-NEXT: callq __umodti3@PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq $-1, %r10
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: movq %rax, %rcx
+; X86-64-NEXT: movq %rdx, %r8
+; X86-64-NEXT: movq $-3, %r11
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: movq %rdx, %r9
+; X86-64-NEXT: addq %rcx, %r9
+; X86-64-NEXT: adcq $0, %r8
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r10
+; X86-64-NEXT: movq %rdx, %rcx
+; X86-64-NEXT: movq %rax, %r10
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: mulq %r11
+; X86-64-NEXT: addq %r9, %rax
+; X86-64-NEXT: adcq %r8, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: addq %r10, %rdx
+; X86-64-NEXT: adcq $0, %rcx
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: subq %rdx, %rax
+; X86-64-NEXT: movq %rsi, %r8
+; X86-64-NEXT: sbbq %rcx, %r8
+; X86-64-NEXT: shrdq $1, %r8, %rax
+; X86-64-NEXT: shrq %r8
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: adcq %rcx, %r8
+; X86-64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X86-64-NEXT: andq %r8, %rax
+; X86-64-NEXT: shrq $63, %r8
+; X86-64-NEXT: subq %r8, %rdi
+; X86-64-NEXT: sbbq %rax, %rsi
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: movq %rsi, %rdx
; X86-64-NEXT: retq
;
; WIN64-LABEL: urem_magic_is_add:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: movq $-1, %rsi
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: movq %rax, %r9
+; WIN64-NEXT: movq %rdx, %r10
+; WIN64-NEXT: movq $-3, %rdi
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: movq %rdx, %r11
+; WIN64-NEXT: addq %r9, %r11
+; WIN64-NEXT: adcq $0, %r10
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rsi
+; WIN64-NEXT: movq %rdx, %r9
+; WIN64-NEXT: movq %rax, %rsi
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rdi
+; WIN64-NEXT: addq %r11, %rax
+; WIN64-NEXT: adcq %r10, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: addq %rsi, %rdx
+; WIN64-NEXT: adcq $0, %r9
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: subq %rdx, %rax
+; WIN64-NEXT: movq %r8, %r10
+; WIN64-NEXT: sbbq %r9, %r10
+; WIN64-NEXT: shrdq $1, %r10, %rax
+; WIN64-NEXT: shrq %r10
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: adcq %r9, %r10
; WIN64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __umodti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT: movq %xmm0, %rdx
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: andq %r10, %rax
+; WIN64-NEXT: shrq $63, %r10
+; WIN64-NEXT: subq %r10, %rcx
+; WIN64-NEXT: sbbq %rax, %r8
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: movq %r8, %rdx
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
; WIN64-NEXT: retq
%ret = urem i128 %x, 170141183460469231731687303715884105729 ; = 2^127 + 1
ret i128 %ret
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index 318a48b92cb28..9b845e934b4a0 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -152,40 +152,57 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X86-SSE2-LABEL: fshl_i37:
; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: andl $31, %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE2-NEXT: andl $31, %esi
+; X86-SSE2-NEXT: movl $116080197, %edx # imm = 0x6EB3E45
+; X86-SSE2-NEXT: movl %ecx, %eax
+; X86-SSE2-NEXT: mull %edx
+; X86-SSE2-NEXT: movl %eax, %ebx
+; X86-SSE2-NEXT: movl %edx, %edi
+; X86-SSE2-NEXT: movl $812561381, %edx # imm = 0x306EB3E5
+; X86-SSE2-NEXT: movl %ecx, %eax
+; X86-SSE2-NEXT: mull %edx
+; X86-SSE2-NEXT: movl %edx, %ebp
+; X86-SSE2-NEXT: addl %ebx, %ebp
+; X86-SSE2-NEXT: adcl $0, %edi
+; X86-SSE2-NEXT: movl %esi, %eax
+; X86-SSE2-NEXT: movl $812561381, %edx # imm = 0x306EB3E5
+; X86-SSE2-NEXT: mull %edx
+; X86-SSE2-NEXT: addl %ebp, %eax
+; X86-SSE2-NEXT: adcl %edi, %edx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: imull $116080197, %esi, %esi # imm = 0x6EB3E45
+; X86-SSE2-NEXT: addl %edx, %esi
+; X86-SSE2-NEXT: leal (%esi,%esi,8), %edx
+; X86-SSE2-NEXT: leal (%esi,%edx,4), %edx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT: shldl $27, %ebx, %edi
-; X86-SSE2-NEXT: pushl $0
-; X86-SSE2-NEXT: pushl $37
-; X86-SSE2-NEXT: pushl %eax
-; X86-SSE2-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: calll __umoddi3
-; X86-SSE2-NEXT: addl $16, %esp
-; X86-SSE2-NEXT: movl %eax, %ecx
+; X86-SSE2-NEXT: subl %edx, %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT: shldl $27, %edi, %esi
; X86-SSE2-NEXT: testb $32, %cl
; X86-SSE2-NEXT: jne .LBB3_1
; X86-SSE2-NEXT: # %bb.2:
-; X86-SSE2-NEXT: movl %edi, %ebx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT: movl %esi, %edi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT: movl %eax, %esi
; X86-SSE2-NEXT: jmp .LBB3_3
; X86-SSE2-NEXT: .LBB3_1:
-; X86-SSE2-NEXT: shll $27, %ebx
+; X86-SSE2-NEXT: shll $27, %edi
+; X86-SSE2-NEXT: movl %eax, %edx
; X86-SSE2-NEXT: .LBB3_3:
-; X86-SSE2-NEXT: movl %edi, %eax
-; X86-SSE2-NEXT: shldl %cl, %ebx, %eax
+; X86-SSE2-NEXT: movl %esi, %eax
+; X86-SSE2-NEXT: shldl %cl, %edi, %eax
; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT: shldl %cl, %edi, %esi
-; X86-SSE2-NEXT: movl %esi, %edx
+; X86-SSE2-NEXT: shldl %cl, %esi, %edx
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X64-AVX-LABEL: fshl_i37:
@@ -318,41 +335,58 @@ declare i37 @llvm.fshr.i37(i37, i37, i37)
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X86-SSE2-LABEL: fshr_i37:
; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: andl $31, %eax
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT: shldl $27, %ebx, %esi
-; X86-SSE2-NEXT: pushl $0
-; X86-SSE2-NEXT: pushl $37
-; X86-SSE2-NEXT: pushl %eax
-; X86-SSE2-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: calll __umoddi3
-; X86-SSE2-NEXT: addl $16, %esp
-; X86-SSE2-NEXT: movl %eax, %ecx
-; X86-SSE2-NEXT: addl $27, %ecx
+; X86-SSE2-NEXT: andl $31, %esi
+; X86-SSE2-NEXT: movl $116080197, %edx # imm = 0x6EB3E45
+; X86-SSE2-NEXT: movl %ebp, %eax
+; X86-SSE2-NEXT: mull %edx
+; X86-SSE2-NEXT: movl %eax, %ebx
+; X86-SSE2-NEXT: movl %edx, %edi
+; X86-SSE2-NEXT: movl $812561381, %ecx # imm = 0x306EB3E5
+; X86-SSE2-NEXT: movl %ebp, %eax
+; X86-SSE2-NEXT: mull %ecx
+; X86-SSE2-NEXT: movl %edx, %ebp
+; X86-SSE2-NEXT: addl %ebx, %ebp
+; X86-SSE2-NEXT: adcl $0, %edi
+; X86-SSE2-NEXT: movl %esi, %eax
+; X86-SSE2-NEXT: mull %ecx
+; X86-SSE2-NEXT: movl %edx, %ebx
+; X86-SSE2-NEXT: addl %ebp, %eax
+; X86-SSE2-NEXT: adcl %edi, %ebx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT: imull $116080197, %esi, %eax # imm = 0x6EB3E45
+; X86-SSE2-NEXT: addl %ebx, %eax
+; X86-SSE2-NEXT: leal (%eax,%eax,8), %ecx
+; X86-SSE2-NEXT: leal (%eax,%ecx,4), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE2-NEXT: negl %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: leal 27(%ecx,%eax), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: shldl $27, %edi, %eax
; X86-SSE2-NEXT: testb $32, %cl
; X86-SSE2-NEXT: je .LBB10_1
; X86-SSE2-NEXT: # %bb.2:
-; X86-SSE2-NEXT: movl %edi, %edx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SSE2-NEXT: jmp .LBB10_3
; X86-SSE2-NEXT: .LBB10_1:
-; X86-SSE2-NEXT: shll $27, %ebx
-; X86-SSE2-NEXT: movl %esi, %edx
-; X86-SSE2-NEXT: movl %ebx, %esi
+; X86-SSE2-NEXT: shll $27, %edi
+; X86-SSE2-NEXT: movl %edx, %esi
+; X86-SSE2-NEXT: movl %eax, %edx
+; X86-SSE2-NEXT: movl %edi, %eax
; X86-SSE2-NEXT: .LBB10_3:
-; X86-SSE2-NEXT: shrdl %cl, %edx, %esi
+; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT: shrdl %cl, %edi, %edx
-; X86-SSE2-NEXT: movl %esi, %eax
+; X86-SSE2-NEXT: shrdl %cl, %esi, %edx
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X64-AVX-LABEL: fshr_i37:
diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll
index e347c5944c2b9..93c5fb98c685a 100644
--- a/llvm/test/CodeGen/X86/i128-udiv.ll
+++ b/llvm/test/CodeGen/X86/i128-udiv.ll
@@ -320,11 +320,19 @@ define i128 @test2(i128 %x) nounwind {
;
; X64-LABEL: test2:
; X64: # %bb.0:
-; X64-NEXT: pushq %rax
+; X64-NEXT: shrq $2, %rsi
+; X64-NEXT: movl $4, %ecx
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movl $17, %edx
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %rdx
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: movq $-4, %rcx
-; X64-NEXT: callq __udivti3@PLT
-; X64-NEXT: popq %rcx
; X64-NEXT: retq
%tmp = udiv i128 %x, -73786976294838206464
ret i128 %tmp
@@ -631,11 +639,31 @@ define i128 @test3(i128 %x) nounwind {
;
; X64-LABEL: test3:
; X64: # %bb.0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movq $-3, %rdx
-; X64-NEXT: movq $-5, %rcx
-; X64-NEXT: callq __udivti3@PLT
-; X64-NEXT: popq %rcx
+; X64-NEXT: movabsq $4611686018427387905, %r9 # imm = 0x4000000000000001
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movl $5, %r10d
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r10
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: addq %rcx, %rdi
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r10
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: adcq %r8, %rdx
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: addq %r9, %rdx
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: shrq $62, %rcx
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
%tmp = udiv i128 %x, -73786976294838206467
ret i128 %tmp
@@ -2907,11 +2935,32 @@ define i128 @div_by_67(i128 %x) nounwind {
;
; X64-LABEL: div_by_67:
; X64: # %bb.0: # %entry
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $67, %edx
-; X64-NEXT: xorl %ecx, %ecx
-; X64-NEXT: callq __udivti3@PLT
-; X64-NEXT: popq %rcx
+; X64-NEXT: movabsq $-825973615240726192, %r9 # imm = 0xF4898D5F85BB3950
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movabsq $4405192614617206357, %r10 # imm = 0x3D226357E16ECE55
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r10
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: addq %r8, %rdi
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r10
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: adcq %rdx, %rcx
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: addq %r9, %rcx
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: shrdq $6, %r8, %rcx
+; X64-NEXT: shrq $6, %r8
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %r8, %rdx
; X64-NEXT: retq
entry:
%div = udiv i128 %x, 67
More information about the llvm-commits
mailing list