[llvm] [CodeGen] Emit a more efficient magic number multiplication for exact udivs (PR #87161)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 10 06:57:38 PDT 2024
https://github.com/AtariDreams updated https://github.com/llvm/llvm-project/pull/87161
From 318984b839feeedcaee4bdbeff4067a04c5f0759 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Fri, 3 May 2024 20:50:49 -0400
Subject: [PATCH 1/2] [CodeGen] Pre-commit tests (NFC)
---
.../AArch64/GlobalISel/combine-udiv.ll | 25 ++
.../AArch64/GlobalISel/combine-udiv.mir | 123 ++++++++++
llvm/test/CodeGen/X86/udiv-exact.ll | 225 ++++++++++++++++++
3 files changed, 373 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/udiv-exact.ll
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index c97a00ccdd455..d0fb68f08fff9 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -269,3 +269,28 @@ define i32 @udiv_div_by_180(i32 %x)
%udiv = udiv i32 %truncate, 180
ret i32 %udiv
}
+
+define i32 @udiv_div_by_180_exact(i32 %x)
+; SDAG-LABEL: udiv_div_by_180_exact:
+; SDAG: // %bb.0:
+; SDAG-NEXT: lsr w8, w0, #2
+; SDAG-NEXT: mov w9, #27671 // =0x6c17
+; SDAG-NEXT: movk w9, #5825, lsl #16
+; SDAG-NEXT: umull x8, w8, w9
+; SDAG-NEXT: lsr x0, x8, #34
+; SDAG-NEXT: // kill: def $w0 killed $w0 killed $x0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: udiv_div_by_180_exact:
+; GISEL: // %bb.0:
+; GISEL-NEXT: lsr w8, w0, #2
+; GISEL-NEXT: mov w9, #27671 // =0x6c17
+; GISEL-NEXT: movk w9, #5825, lsl #16
+; GISEL-NEXT: umull x8, w8, w9
+; GISEL-NEXT: lsr x8, x8, #32
+; GISEL-NEXT: lsr w0, w8, #2
+; GISEL-NEXT: ret
+{
+ %udiv = udiv exact i32 %x, 180
+ ret i32 %udiv
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
index 02233b9f498bd..539152417e01f 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
@@ -304,5 +304,128 @@ body: |
%10:_(<8 x s16>) = G_UDIV %0, %1
$q0 = COPY %10(<8 x s16>)
RET_ReallyLR implicit $q0
+...
+---
+name: udiv_exact
+body: |
+ bb.1:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: udiv_exact
+ ; CHECK: liveins: $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = G_CONSTANT i32 104
+ %2:_(s32) = exact G_UDIV %0, %1
+ $w0 = COPY %2(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: udiv_noexact
+body: |
+ bb.1:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: udiv_noexact
+ ; CHECK: liveins: $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = G_CONSTANT i32 104
+ %2:_(s32) = G_UDIV %0, %1
+ $w0 = COPY %2(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: udiv_exact_minsize
+body: |
+ bb.1:
+ liveins: $w0
+
+ ; CHECK-LABEL: name: udiv_exact_minsize
+ ; CHECK: liveins: $w0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = G_CONSTANT i32 104
+ %2:_(s32) = exact G_UDIV %0, %1
+ $w0 = COPY %2(s32)
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: div_v4s32
+body: |
+ bb.1:
+ liveins: $q0
+
+ ; CHECK-LABEL: name: div_v4s32
+ ; CHECK: liveins: $q0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 954437177
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C2]](s32), [[C]](s32), [[C2]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C3]](s32), [[C1]](s32), [[C3]](s32)
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<4 x s32>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<4 x s32>)
+ ; CHECK-NEXT: $q0 = COPY [[LSHR]](<4 x s32>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:_(<4 x s32>) = COPY $q0
+ %c1:_(s32) = G_CONSTANT i32 104
+ %c2:_(s32) = G_CONSTANT i32 72
+ %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c2(s32), %c1(s32), %c2(s32)
+ %3:_(<4 x s32>) = exact G_UDIV %0, %1
+ $q0 = COPY %3(<4 x s32>)
+ RET_ReallyLR implicit $q0
+
+...
+---
+name: div_v4s32_splat
+body: |
+ bb.1:
+ liveins: $q0
+
+ ; CHECK-LABEL: name: div_v4s32_splat
+ ; CHECK: liveins: $q0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<4 x s32>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<4 x s32>)
+ ; CHECK-NEXT: $q0 = COPY [[LSHR]](<4 x s32>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:_(<4 x s32>) = COPY $q0
+ %c1:_(s32) = G_CONSTANT i32 104
+ %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c1(s32), %c1(s32), %c1(s32)
+ %3:_(<4 x s32>) = exact G_UDIV %0, %1
+ $q0 = COPY %3(<4 x s32>)
+ RET_ReallyLR implicit $q0
...
diff --git a/llvm/test/CodeGen/X86/udiv-exact.ll b/llvm/test/CodeGen/X86/udiv-exact.ll
new file mode 100644
index 0000000000000..0a835e0710788
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udiv-exact.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
+
+define i32 @test1(i32 %x) {
+; X86-LABEL: test1:
+; X86: # %bb.0:
+; X86-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imulq $1374389535, %rax, %rax # imm = 0x51EB851F
+; X64-NEXT: shrq $35, %rax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: retq
+ %div = udiv exact i32 %x, 25
+ ret i32 %div
+}
+
+define i32 @test2(i32 %x) {
+; X86-LABEL: test2:
+; X86: # %bb.0:
+; X86-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl $4, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: movl $2863311531, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: shrq $36, %rax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: retq
+ %div = udiv exact i32 %x, 24
+ ret i32 %div
+}
+
+define <4 x i32> @test3(<4 x i32> %x) {
+; X86-LABEL: test3:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: psrld $4, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # %bb.0:
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
+; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; X64-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-NEXT: vpsrld $4, %xmm0, %xmm0
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 24, i32 24>
+ ret <4 x i32> %div
+}
+
+define <4 x i32> @test4(<4 x i32> %x) {
+; X86-LABEL: test4:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: psrld $3, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: test4:
+; X64: # %bb.0:
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; X64-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-NEXT: vpsrld $3, %xmm0, %xmm0
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 25, i32 25>
+ ret <4 x i32> %div
+}
+
+define <4 x i32> @test5(<4 x i32> %x) {
+; X86-LABEL: test5:
+; X86: # %bb.0:
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $4, %xmm1
+; X86-NEXT: psrld $3, %xmm0
+; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test5:
+; X64: # %bb.0:
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 25, i32 25>
+ ret <4 x i32> %div
+}
+
+define <4 x i32> @test6(<4 x i32> %x) {
+; X86-LABEL: test6:
+; X86: # %bb.0:
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $4, %xmm1
+; X86-NEXT: psrld $3, %xmm0
+; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test6:
+; X64: # %bb.0:
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 26, i32 26>
+ ret <4 x i32> %div
+}
+
+define <4 x i32> @test7(<4 x i32> %x) {
+; X86-LABEL: test7:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,795364315,795364315]
+; X86-NEXT: pmuludq %xmm0, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-NEXT: psubd %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X86-NEXT: paddd %xmm1, %xmm0
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $3, %xmm1
+; X86-NEXT: psrld $4, %xmm0
+; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test7:
+; X64: # %bb.0:
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; X64-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 27, i32 27>
+ ret <4 x i32> %div
+}
+
+define <4 x i32> @test8(<4 x i32> %x) {
+; X86-LABEL: test8:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [u,u,2863311531,2863311531]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X86-NEXT: psrld $4, %xmm2
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; X86-NEXT: retl
+;
+; X64-LABEL: test8:
+; X64: # %bb.0:
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; X64-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; X64-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT: vpsrld $4, %xmm1, %xmm1
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 1, i32 1, i32 24, i32 24>
+ ret <4 x i32> %div
+}
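
For reference, the pre-commit AArch64 expectation for udiv_div_by_180_exact above is the generic round-up magic-number sequence: lsr #2 divides by 4, and the umull by 0x16C16C17 (mov #27671 plus movk #5825, lsl #16) followed by lsr #34 computes (x >> 2) / 45, using ceil(2^34 / 45) as the magic constant. A minimal standalone sketch of that computation (not part of the patch; the constants are taken from the checks above):

#include <cassert>
#include <cstdint>

int main() {
  // 0x16C16C17 == ceil(2^34 / 45), i.e. mov #27671 | movk #5825, lsl #16.
  const uint64_t Magic = (5825u << 16) | 27671u;
  assert(Magic == ((uint64_t(1) << 34) + 44) / 45);
  // Sample a range of inputs; umull + lsr #34 reproduces (x >> 2) / 45,
  // which equals x / 180 whenever x is an exact multiple of 180.
  for (uint32_t X = 0; X < 10000000; X += 179) {
    uint32_t Q = uint32_t((uint64_t(X >> 2) * Magic) >> 34);
    assert(Q == (X >> 2) / 45);
  }
  return 0;
}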
From 3a9ac8c6c0d3e439514f2c3ffc11168bdf70f556 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Sat, 30 Mar 2024 12:24:46 -0400
Subject: [PATCH 2/2] [CodeGen] Emit a more efficient magic number
multiplication for exact udivs
Use a simpler lowering for exact udivs in both SelectionDAG and GlobalISel: since the dividend is known to be a multiple of the divisor, shift out the divisor's trailing zeros with an exact right shift and multiply by the multiplicative inverse of the remaining odd factor modulo 2^BW, instead of the generic UMULH-based magic-number sequence.
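
For anyone unfamiliar with the trick, the arithmetic the new lowering relies on is: an exact udiv by d = odd * 2^k equals (x >> k) * inverse(odd), where the inverse is taken modulo 2^BW. A minimal standalone sketch (not part of the patch; inverseMod2_32 is a made-up helper, the patch itself uses APInt::multiplicativeInverse()), using divisor 24 and the 0xAAAAAAAB factor that appears in the updated X86 checks below:

#include <cassert>
#include <cstdint>

// Multiplicative inverse of an odd value modulo 2^32 via Newton's
// iteration: Odd is already its own inverse modulo 8, and each step
// doubles the number of correct low bits.
static uint32_t inverseMod2_32(uint32_t Odd) {
  uint32_t Inv = Odd;
  for (int I = 0; I < 5; ++I)
    Inv *= 2 - Odd * Inv;
  return Inv;
}

int main() {
  const uint32_t Divisor = 24;                               // 3 * 2^3
  const uint32_t Shift = 3;                                  // countr_zero(24)
  const uint32_t Factor = inverseMod2_32(Divisor >> Shift);  // 0xAAAAAAAB
  assert(Factor == 0xAAAAAAABu);
  for (uint32_t Q = 0; Q < 100000; ++Q) {
    uint32_t X = Q * Divisor;  // 'exact' guarantees X is a multiple of 24.
    assert((X >> Shift) * Factor == X / Divisor);
  }
  return 0;
}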
---
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 53 +++++-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 69 +++++++-
.../AArch64/GlobalISel/combine-udiv.ll | 16 +-
.../AArch64/GlobalISel/combine-udiv.mir | 45 +++--
llvm/test/CodeGen/X86/udiv-exact.ll | 166 ++++++------------
5 files changed, 202 insertions(+), 147 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 02d85958fc7be..b6bd715e49176 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5185,8 +5185,35 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
KB ? KB->getKnownBits(LHS).countMinLeadingZeros() : 0;
auto &MIB = Builder;
+ bool UseSRL = false;
bool UseNPQ = false;
SmallVector<Register, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
+ SmallVector<Register, 16> Shifts, Factors;
+ auto *RHSDefInstr = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
+ bool IsSplat = getIConstantSplatVal(*RHSDefInstr, MRI).has_value();
+
+ auto BuildExactUDIVPattern = [&](const Constant *C) {
+ // Don't recompute inverses for each splat element.
+ if (IsSplat && !Factors.empty()) {
+ Shifts.push_back(Shifts[0]);
+ Factors.push_back(Factors[0]);
+ return true;
+ }
+
+ auto *CI = cast<ConstantInt>(C);
+ APInt Divisor = CI->getValue();
+ unsigned Shift = Divisor.countr_zero();
+ if (Shift) {
+ Divisor.lshrInPlace(Shift);
+ UseSRL = true;
+ }
+
+ // Calculate the multiplicative inverse modulo BW.
+ APInt Factor = Divisor.multiplicativeInverse();
+ Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
+ Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
+ return true;
+ };
auto BuildUDIVPattern = [&](const Constant *C) {
auto *CI = cast<ConstantInt>(C);
@@ -5233,6 +5260,29 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
return true;
};
+ if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+ // Collect all magic values from the build vector.
+ bool Matched = matchUnaryPredicate(MRI, RHS, BuildExactUDIVPattern);
+ (void)Matched;
+ assert(Matched && "Expected unary predicate match to succeed");
+
+ Register Shift, Factor;
+ if (Ty.isVector()) {
+ Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+ Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
+ } else {
+ Shift = Shifts[0];
+ Factor = Factors[0];
+ }
+
+ Register Res = LHS;
+
+ if (UseSRL)
+ Res = MIB.buildLShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+
+ return MIB.buildMul(Ty, Res, Factor);
+ }
+
// Collect the shifts/magic values from each element.
bool Matched = matchUnaryPredicate(MRI, RHS, BuildUDIVPattern);
(void)Matched;
@@ -5286,7 +5336,8 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) {
Register RHS = MI.getOperand(2).getReg();
LLT DstTy = MRI.getType(Dst);
auto *RHSDef = MRI.getVRegDef(RHS);
- if (!isConstantOrConstantVector(*RHSDef, MRI))
+ if (!MI.getFlag(MachineInstr::MIFlag::IsExact) &&
+ !isConstantOrConstantVector(*RHSDef, MRI))
return false;
auto &MF = *MI.getMF();
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c8a5e15257608..cb13d881b5b17 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6128,10 +6128,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
}
SDValue Res = Op0;
-
- // Shift the value upfront if it is even, so the LSB is one.
if (UseSRA) {
- // TODO: For UDIV use SRL instead of SRA.
SDNodeFlags Flags;
Flags.setExact(true);
Res = DAG.getNode(ISD::SRA, dl, VT, Res, Shift, Flags);
@@ -6141,6 +6138,68 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
return DAG.getNode(ISD::MUL, dl, VT, Res, Factor);
}
+/// Given an exact UDIV by a constant, create a multiplication
+/// with the multiplicative inverse of the constant.
+static SDValue BuildExactUDIV(const TargetLowering &TLI, SDNode *N,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ EVT ShSVT = ShVT.getScalarType();
+
+ bool UseSRL = false;
+ SmallVector<SDValue, 16> Shifts, Factors;
+
+ auto BuildUDIVPattern = [&](ConstantSDNode *C) {
+ if (C->isZero())
+ return false;
+ APInt Divisor = C->getAPIntValue();
+ unsigned Shift = Divisor.countr_zero();
+ if (Shift) {
+ Divisor.lshrInPlace(Shift);
+ UseSRL = true;
+ }
+ // Calculate the multiplicative inverse modulo BW.
+ APInt Factor = Divisor.multiplicativeInverse();
+ Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT));
+ Factors.push_back(DAG.getConstant(Factor, dl, SVT));
+ return true;
+ };
+
+ // Collect all magic values from the build vector.
+ if (!ISD::matchUnaryPredicate(Op1, BuildUDIVPattern))
+ return SDValue();
+
+ SDValue Shift, Factor;
+ if (Op1.getOpcode() == ISD::BUILD_VECTOR) {
+ Shift = DAG.getBuildVector(ShVT, dl, Shifts);
+ Factor = DAG.getBuildVector(VT, dl, Factors);
+ } else if (Op1.getOpcode() == ISD::SPLAT_VECTOR) {
+ assert(Shifts.size() == 1 && Factors.size() == 1 &&
+ "Expected matchUnaryPredicate to return one element for scalable "
+ "vectors");
+ Shift = DAG.getSplatVector(ShVT, dl, Shifts[0]);
+ Factor = DAG.getSplatVector(VT, dl, Factors[0]);
+ } else {
+ assert(isa<ConstantSDNode>(Op1) && "Expected a constant");
+ Shift = Shifts[0];
+ Factor = Factors[0];
+ }
+
+ SDValue Res = Op0;
+ if (UseSRL) {
+ SDNodeFlags Flags;
+ Flags.setExact(true);
+ Res = DAG.getNode(ISD::SRL, dl, VT, Res, Shift, Flags);
+ Created.push_back(Res.getNode());
+ }
+
+ return DAG.getNode(ISD::MUL, dl, VT, Res, Factor);
+}
+
SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const {
@@ -6400,6 +6459,10 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+ // If the udiv has an 'exact' bit we can use a simpler lowering.
+ if (N->getFlags().hasExact())
+ return BuildExactUDIV(*this, N, dl, DAG, Created);
+
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index d0fb68f08fff9..1079d91f8bb80 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -274,21 +274,17 @@ define i32 @udiv_div_by_180_exact(i32 %x)
; SDAG-LABEL: udiv_div_by_180_exact:
; SDAG: // %bb.0:
; SDAG-NEXT: lsr w8, w0, #2
-; SDAG-NEXT: mov w9, #27671 // =0x6c17
-; SDAG-NEXT: movk w9, #5825, lsl #16
-; SDAG-NEXT: umull x8, w8, w9
-; SDAG-NEXT: lsr x0, x8, #34
-; SDAG-NEXT: // kill: def $w0 killed $w0 killed $x0
+; SDAG-NEXT: mov w9, #20389 // =0x4fa5
+; SDAG-NEXT: movk w9, #42234, lsl #16
+; SDAG-NEXT: mul w0, w8, w9
; SDAG-NEXT: ret
;
; GISEL-LABEL: udiv_div_by_180_exact:
; GISEL: // %bb.0:
; GISEL-NEXT: lsr w8, w0, #2
-; GISEL-NEXT: mov w9, #27671 // =0x6c17
-; GISEL-NEXT: movk w9, #5825, lsl #16
-; GISEL-NEXT: umull x8, w8, w9
-; GISEL-NEXT: lsr x8, x8, #32
-; GISEL-NEXT: lsr w0, w8, #2
+; GISEL-NEXT: mov w9, #20389 // =0x4fa5
+; GISEL-NEXT: movk w9, #42234, lsl #16
+; GISEL-NEXT: mul w0, w8, w9
; GISEL-NEXT: ret
{
%udiv = udiv exact i32 %x, 180
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
index 539152417e01f..f8578a694e2d4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
@@ -315,11 +315,11 @@ body: |
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
- ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
- ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = exact G_LSHR [[COPY]], [[C]](s32)
+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[LSHR]], [[C1]]
+ ; CHECK-NEXT: $w0 = COPY [[MUL]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:_(s32) = COPY $w0
%1:_(s32) = G_CONSTANT i32 104
@@ -361,11 +361,11 @@ body: |
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
- ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
- ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = exact G_LSHR [[COPY]], [[C]](s32)
+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[LSHR]], [[C1]]
+ ; CHECK-NEXT: $w0 = COPY [[MUL]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:_(s32) = COPY $w0
%1:_(s32) = G_CONSTANT i32 104
@@ -384,15 +384,14 @@ body: |
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 954437177
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C2]](s32), [[C]](s32), [[C2]](s32)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C3]](s32), [[C1]](s32), [[C3]](s32)
- ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<4 x s32>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<4 x s32>)
- ; CHECK-NEXT: $q0 = COPY [[LSHR]](<4 x s32>)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32), [[C1]](s32), [[C2]](s32)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = exact G_LSHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>)
+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[LSHR]], [[BUILD_VECTOR1]]
+ ; CHECK-NEXT: $q0 = COPY [[MUL]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<4 x s32>) = COPY $q0
%c1:_(s32) = G_CONSTANT i32 104
@@ -413,13 +412,13 @@ body: |
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
- ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<4 x s32>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<4 x s32>)
- ; CHECK-NEXT: $q0 = COPY [[LSHR]](<4 x s32>)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = exact G_LSHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>)
+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[LSHR]], [[BUILD_VECTOR1]]
+ ; CHECK-NEXT: $q0 = COPY [[MUL]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<4 x s32>) = COPY $q0
%c1:_(s32) = G_CONSTANT i32 104
diff --git a/llvm/test/CodeGen/X86/udiv-exact.ll b/llvm/test/CodeGen/X86/udiv-exact.ll
index 0a835e0710788..271d11edff9a7 100644
--- a/llvm/test/CodeGen/X86/udiv-exact.ll
+++ b/llvm/test/CodeGen/X86/udiv-exact.ll
@@ -5,18 +5,12 @@
define i32 @test1(i32 %x) {
; X86-LABEL: test1:
; X86: # %bb.0:
-; X86-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl $3, %eax
+; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %eax # imm = 0xC28F5C29
; X86-NEXT: retl
;
; X64-LABEL: test1:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: imulq $1374389535, %rax, %rax # imm = 0x51EB851F
-; X64-NEXT: shrq $35, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: imull $-1030792151, %edi, %eax # imm = 0xC28F5C29
; X64-NEXT: retq
%div = udiv exact i32 %x, 25
ret i32 %div
@@ -25,19 +19,15 @@ define i32 @test1(i32 %x) {
define i32 @test2(i32 %x) {
; X86-LABEL: test2:
; X86: # %bb.0:
-; X86-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl $4, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
; X86-NEXT: retl
;
; X64-LABEL: test2:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: movl $2863311531, %eax # imm = 0xAAAAAAAB
-; X64-NEXT: imulq %rcx, %rax
-; X64-NEXT: shrq $36, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: shrl $3, %edi
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
; X64-NEXT: retq
%div = udiv exact i32 %x, 24
ret i32 %div
@@ -46,25 +36,21 @@ define i32 @test2(i32 %x) {
define <4 x i32> @test3(<4 x i32> %x) {
; X86-LABEL: test3:
; X86: # %bb.0:
+; X86-NEXT: psrld $3, %xmm0
; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-NEXT: pmuludq %xmm1, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: psrld $4, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test3:
; X64: # %bb.0:
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; X64-NEXT: vpsrld $4, %xmm0, %xmm0
+; X64-NEXT: vpsrld $3, %xmm0, %xmm0
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 24, i32 24>
ret <4 x i32> %div
@@ -73,25 +59,19 @@ define <4 x i32> @test3(<4 x i32> %x) {
define <4 x i32> @test4(<4 x i32> %x) {
; X86-LABEL: test4:
; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-NEXT: pmuludq %xmm1, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: psrld $3, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test4:
; X64: # %bb.0:
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; X64-NEXT: vpsrld $3, %xmm0, %xmm0
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 25, i32 25>
ret <4 x i32> %div
@@ -100,26 +80,22 @@ define <4 x i32> @test4(<4 x i32> %x) {
define <4 x i32> @test5(<4 x i32> %x) {
; X86-LABEL: test5:
; X86: # %bb.0:
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $3, %xmm1
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X86-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145]
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: movdqa %xmm0, %xmm1
-; X86-NEXT: psrld $4, %xmm1
-; X86-NEXT: psrld $3, %xmm0
-; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT: retl
;
; X64-LABEL: test5:
; X64: # %bb.0:
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 25, i32 25>
ret <4 x i32> %div
@@ -128,26 +104,24 @@ define <4 x i32> @test5(<4 x i32> %x) {
define <4 x i32> @test6(<4 x i32> %x) {
; X86-LABEL: test6:
; X86: # %bb.0:
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: movdqa %xmm0, %xmm1
-; X86-NEXT: psrld $4, %xmm1
-; X86-NEXT: psrld $3, %xmm0
+; X86-NEXT: psrld $3, %xmm1
+; X86-NEXT: psrld $1, %xmm0
; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997]
+; X86-NEXT: pmuludq %xmm0, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: movdqa %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test6:
; X64: # %bb.0:
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 26, i32 26>
ret <4 x i32> %div
@@ -156,42 +130,17 @@ define <4 x i32> @test6(<4 x i32> %x) {
define <4 x i32> @test7(<4 x i32> %x) {
; X86-LABEL: test7:
; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,795364315,795364315]
-; X86-NEXT: pmuludq %xmm0, %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-NEXT: psubd %xmm1, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X86-NEXT: paddd %xmm1, %xmm0
-; X86-NEXT: movdqa %xmm0, %xmm1
-; X86-NEXT: psrld $3, %xmm1
-; X86-NEXT: psrld $4, %xmm0
-; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: test7:
; X64: # %bb.0:
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X64-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; X64-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; X64-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 27, i32 27>
ret <4 x i32> %div
@@ -200,25 +149,22 @@ define <4 x i32> @test7(<4 x i32> %x) {
define <4 x i32> @test8(<4 x i32> %x) {
; X86-LABEL: test8:
; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [u,u,2863311531,2863311531]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; X86-NEXT: psrld $4, %xmm2
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $3, %xmm1
+; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531]
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: test8:
; X64: # %bb.0:
-; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
-; X64-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X64-NEXT: vpsrld $4, %xmm1, %xmm1
-; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 1, i32 1, i32 24, i32 24>
ret <4 x i32> %div
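
One more sanity check on the updated AArch64 expectations for udiv_div_by_180_exact (again just a sketch, not part of the patch): 180 = 45 * 2^2, so the new sequence is lsr #2 followed by a plain mul, and the mov #20389 / movk #42234, lsl #16 pair is 0xA4FA4FA5, the inverse of 45 modulo 2^32:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Factor = (42234u << 16) | 20389u;  // 0xA4FA4FA5
  assert(45u * Factor == 1u);                       // inverse of 45 mod 2^32
  uint32_t X = 180u * 12345u;                       // any exact multiple of 180
  assert((X >> 2) * Factor == X / 180u);
  return 0;
}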