[llvm] r342114 - [X86] Type legalize v2i32 div/rem by scalarizing rather than promoting
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 12 23:13:38 PDT 2018
Author: ctopper
Date: Wed Sep 12 23:13:37 2018
New Revision: 342114
URL: http://llvm.org/viewvc/llvm-project?rev=342114&view=rev
Log:
[X86] Type legalize v2i32 div/rem by scalarizing rather than promoting
Summary:
Previously we type legalized v2i32 div/rem by promoting to v2i64. But we don't support div/rem of vectors, so op legalization would then scalarize it using i64 scalar ops, since it doesn't know about the original promotion. 64-bit scalar divides are known to be slow on Intel hardware, and in 32-bit mode they require a libcall.
This patch switches type legalization to do the scalarizing itself using i32.
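To make the difference concrete, here is a conceptual sketch in plain C++ of the two legalization strategies for a <2 x i32> unsigned divide. This is my own illustration, not code from the patch, and the function names are made up:

#include <cstdint>

// Old behavior: each lane is promoted to i64 and then scalarized, so the
// divide is done with 64-bit operations (slow on Intel hardware, and a
// __udivdi3 libcall in 32-bit mode).
void udiv_v2i32_promoted(const uint32_t a[2], const uint32_t b[2], uint32_t r[2]) {
  for (int i = 0; i < 2; ++i)
    r[i] = static_cast<uint32_t>(static_cast<uint64_t>(a[i]) /
                                 static_cast<uint64_t>(b[i]));
}

// New behavior: scalarize directly and divide with i32 operations.
void udiv_v2i32_scalarized(const uint32_t a[2], const uint32_t b[2], uint32_t r[2]) {
  for (int i = 0; i < 2; ++i)
    r[i] = a[i] / b[i];
}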
It looks like the division-by-power-of-2 optimization still kicks in and leaves the code as a vector. The division-by-other-constant optimization doesn't kick in before type legalization since it ignores illegal types. And previously, after type legalization, we scalarized the v2i64 anyway since we don't have v2i64 MULHS/MULHU support.
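As a reference for the updated CHECK lines below, the imulq $613566757 (0x24924925) sequences are the usual multiply-high expansion of an unsigned divide by 7. A minimal C++ sketch of that expansion (my own illustration, not compiler code):

#include <cstdint>

// Unsigned x / 7 via multiply-high, matching the subl/shrl/addl/shrl $2
// sequence in the new test output.
uint32_t udiv7(uint32_t x) {
  uint32_t hi = static_cast<uint32_t>((static_cast<uint64_t>(x) * 0x24924925u) >> 32);
  uint32_t t = (x - hi) >> 1;
  return (t + hi) >> 2;
}

// Unsigned x % 7: compute the quotient and subtract q*7 (the tests do this
// with a leal (,q,8) / subl / addl pattern).
uint32_t urem7(uint32_t x) {
  return x - udiv7(x) * 7;
}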
Another option might be to widen v2i32 to v4i32 so we could do division-by-constant optimizations, but we'd have to be careful to only do that for constant divisors or we'd risk scalarizing into 4 scalar divides.
Reviewers: RKSimon, spatel
Reviewed By: spatel
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D51325
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll
llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=342114&r1=342113&r2=342114&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Sep 12 23:13:37 2018
@@ -779,6 +779,11 @@ X86TargetLowering::X86TargetLowering(con
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
+ setOperationAction(ISD::SDIV, MVT::v2i32, Custom);
+ setOperationAction(ISD::SREM, MVT::v2i32, Custom);
+ setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
+ setOperationAction(ISD::UREM, MVT::v2i32, Custom);
+
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
@@ -25828,6 +25833,18 @@ void X86TargetLowering::ReplaceNodeResul
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM:
+ if (N->getValueType(0) == MVT::v2i32) {
+ // If we're already legalizing via widening, we don't need this since
+ // that will scalarize div/rem.
+ if (getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector)
+ return;
+ // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the
+ // v2i64 and unroll later. But then we create i64 scalar ops which
+ // might be slow in 64-bit mode or require a libcall in 32-bit mode.
+ Results.push_back(DAG.UnrollVectorOp(N));
+ return;
+ }
+ LLVM_FALLTHROUGH;
case ISD::SDIVREM:
case ISD::UDIVREM: {
SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
Modified: llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll?rev=342114&r1=342113&r2=342114&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll (original)
+++ llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll Wed Sep 12 23:13:37 2018
@@ -13,20 +13,19 @@ define void @vectorDiv (<2 x i32> addrsp
; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT: pmovsxdq (%rdi,%rcx,8), %xmm0
-; CHECK-NEXT: pmovsxdq (%rsi,%rcx,8), %xmm1
-; CHECK-NEXT: pextrq $1, %xmm0, %rax
-; CHECK-NEXT: pextrq $1, %xmm1, %rsi
-; CHECK-NEXT: cqto
-; CHECK-NEXT: idivq %rsi
-; CHECK-NEXT: movq %rax, %xmm2
-; CHECK-NEXT: movq %xmm0, %rax
-; CHECK-NEXT: movq %xmm1, %rsi
-; CHECK-NEXT: cqto
-; CHECK-NEXT: idivq %rsi
-; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: pextrd $1, %xmm0, %eax
+; CHECK-NEXT: pextrd $1, %xmm1, %esi
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %esi
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %edi
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %edi
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: pinsrd $1, %esi, %xmm0
; CHECK-NEXT: movq %xmm0, (%r8,%rcx,8)
; CHECK-NEXT: retq
entry:
Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll?rev=342114&r1=342113&r2=342114&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll Wed Sep 12 23:13:37 2018
@@ -8,60 +8,58 @@ define void @test_udiv7_v2i32(<2 x i32>*
; X64-LABEL: test_udiv7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm1, %eax
+; X64-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: subl %ecx, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: shrl $2, %eax
+; X64-NEXT: movd %xmm0, %ecx
+; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: subl %edx, %ecx
+; X64-NEXT: shrl %ecx
+; X64-NEXT: addl %edx, %ecx
+; X64-NEXT: shrl $2, %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movq %xmm0, %rcx
-; X64-NEXT: movabsq $2635249153387078803, %rdi # imm = 0x2492492492492493
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: subq %rdx, %rcx
-; X64-NEXT: shrq %rcx
-; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: shrq $2, %rcx
-; X64-NEXT: movq %rcx, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: movq %xmm0, %rcx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: subq %rdx, %rcx
-; X64-NEXT: shrq %rcx
-; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: shrq $2, %rcx
-; X64-NEXT: movq %rcx, %xmm0
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_udiv7_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $40, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $7, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movd %xmm0, %ecx
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $7, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%esi)
-; X86-NEXT: addl $40, %esp
+; X86-NEXT: movd %xmm0, %esi
+; X86-NEXT: movl $613566757, %ebx # imm = 0x24924925
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: shrl %esi
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: shrl $2, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: shrl %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: shrl $2, %ecx
+; X86-NEXT: movd %ecx, %xmm0
+; X86-NEXT: movd %esi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%edi)
; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_udiv7_v2i32:
@@ -126,68 +124,76 @@ define void @test_urem7_v2i32(<2 x i32>*
; X64-LABEL: test_urem7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm1, %ecx
+; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: subl %edx, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: addl %edx, %eax
+; X64-NEXT: shrl $2, %eax
+; X64-NEXT: leal (,%rax,8), %edx
+; X64-NEXT: subl %edx, %eax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: movd %xmm0, %ecx
+; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: movl %ecx, %edi
+; X64-NEXT: subl %edx, %edi
+; X64-NEXT: shrl %edi
+; X64-NEXT: addl %edx, %edi
+; X64-NEXT: shrl $2, %edi
+; X64-NEXT: leal (,%rdi,8), %edx
+; X64-NEXT: subl %edx, %edi
+; X64-NEXT: addl %ecx, %edi
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movq %xmm0, %rcx
-; X64-NEXT: movabsq $2635249153387078803, %rdi # imm = 0x2492492492492493
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: subq %rdx, %rax
-; X64-NEXT: shrq %rax
-; X64-NEXT: addq %rdx, %rax
-; X64-NEXT: shrq $2, %rax
-; X64-NEXT: leaq (,%rax,8), %rdx
-; X64-NEXT: subq %rdx, %rax
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movq %rax, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: movq %xmm0, %rcx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: subq %rdx, %rax
-; X64-NEXT: shrq %rax
-; X64-NEXT: addq %rdx, %rax
-; X64-NEXT: shrq $2, %rax
-; X64-NEXT: leaq (,%rax,8), %rdx
-; X64-NEXT: subq %rdx, %rax
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movq %rax, %xmm0
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_urem7_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $40, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $7, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movd %xmm0, %ecx
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $7, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: calll __umoddi3
+; X86-NEXT: movd %xmm0, %esi
+; X86-NEXT: movl $613566757, %edi # imm = 0x24924925
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: shrl %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: shrl $2, %ebx
+; X86-NEXT: leal (,%ebx,8), %eax
+; X86-NEXT: subl %eax, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: leal (,%eax,8), %edx
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%esi)
-; X86-NEXT: addl $40, %esp
+; X86-NEXT: movd %ebx, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%ebp)
; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_urem7_v2i32:
@@ -269,64 +275,68 @@ define void @test_urem7_v2i32(<2 x i32>*
define void @test_sdiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
; X64-LABEL: test_sdiv7_v2i32:
; X64: # %bb.0:
-; X64-NEXT: movslq (%rdi), %rcx
-; X64-NEXT: movslq 4(%rdi), %rax
-; X64-NEXT: movabsq $5270498306774157605, %rdi # imm = 0x4924924924924925
-; X64-NEXT: imulq %rdi
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $63, %rax
-; X64-NEXT: sarq %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: movq %rdx, %xmm0
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: imulq %rdi
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $63, %rax
-; X64-NEXT: sarq %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: movq %rdx, %xmm1
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm1, %eax
+; X64-NEXT: cltq
+; X64-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: shrl $31, %ecx
+; X64-NEXT: sarl $2, %eax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: movd %xmm0, %ecx
+; X64-NEXT: movslq %ecx, %rcx
+; X64-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: addl %edx, %ecx
+; X64-NEXT: movl %ecx, %edx
+; X64-NEXT: shrl $31, %edx
+; X64-NEXT: sarl $2, %ecx
+; X64-NEXT: addl %edx, %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_sdiv7_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edi
-; X86-NEXT: movl 4(%eax), %eax
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $7
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __divdi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $7
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: calll __divdi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu (%esp), %xmm1 # 16-byte Reload
-; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%esi)
-; X86-NEXT: addl $16, %esp
+; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: movd %xmm0, %esi
+; X86-NEXT: movl $-1840700269, %ebp # imm = 0x92492493
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: imull %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl $2, %edi
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: imull %ebp
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl $2, %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: movd %edx, %xmm0
+; X86-NEXT: movd %edi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%ebx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_sdiv7_v2i32:
@@ -399,71 +409,80 @@ define void @test_sdiv7_v2i32(<2 x i32>*
define void @test_srem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
; X64-LABEL: test_srem7_v2i32:
; X64: # %bb.0:
-; X64-NEXT: movslq (%rdi), %rcx
-; X64-NEXT: movslq 4(%rdi), %rdi
-; X64-NEXT: movabsq $5270498306774157605, %r8 # imm = 0x4924924924924925
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: imulq %r8
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $63, %rax
-; X64-NEXT: sarq %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: leaq (,%rdx,8), %rax
-; X64-NEXT: subq %rax, %rdx
-; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: movq %rdx, %xmm0
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: imulq %r8
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $63, %rax
-; X64-NEXT: sarq %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: leaq (,%rdx,8), %rax
-; X64-NEXT: subq %rax, %rdx
-; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: movq %rdx, %xmm1
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm1, %eax
+; X64-NEXT: movslq %eax, %rcx
+; X64-NEXT: imulq $-1840700269, %rcx, %rax # imm = 0x92492493
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: movl %eax, %edx
+; X64-NEXT: shrl $31, %edx
+; X64-NEXT: sarl $2, %eax
+; X64-NEXT: addl %edx, %eax
+; X64-NEXT: leal (,%rax,8), %edx
+; X64-NEXT: subl %edx, %eax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: movd %xmm0, %ecx
+; X64-NEXT: movslq %ecx, %rcx
+; X64-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: addl %ecx, %edx
+; X64-NEXT: movl %edx, %edi
+; X64-NEXT: shrl $31, %edi
+; X64-NEXT: sarl $2, %edx
+; X64-NEXT: addl %edi, %edx
+; X64-NEXT: leal (,%rdx,8), %edi
+; X64-NEXT: subl %edi, %edx
+; X64-NEXT: addl %ecx, %edx
+; X64-NEXT: movd %edx, %xmm0
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_srem7_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edi
-; X86-NEXT: movl 4(%eax), %eax
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $7
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __moddi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $7
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: calll __moddi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu (%esp), %xmm1 # 16-byte Reload
-; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%esi)
-; X86-NEXT: addl $16, %esp
+; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: movd %xmm0, %esi
+; X86-NEXT: movl $-1840700269, %ebx # imm = 0x92492493
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: imull %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl $2, %edi
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: leal (,%edi,8), %eax
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: imull %ebx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl $2, %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: leal (,%edx,8), %eax
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movd %edx, %xmm0
+; X86-NEXT: movd %edi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%ebp)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_srem7_v2i32:
@@ -784,66 +803,53 @@ define void @test_udiv_v2i32(<2 x i32>*
; X64: # %bb.0:
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: movq %xmm0, %rax
-; X64-NEXT: movq %xmm2, %rsi
+; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT: movd %xmm2, %esi
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divq %rsi
-; X64-NEXT: movq %rax, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: movq %xmm0, %rax
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; X64-NEXT: movq %xmm0, %rsi
+; X64-NEXT: divl %esi
+; X64-NEXT: movl %eax, %esi
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %edi
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divq %rsi
-; X64-NEXT: movq %rax, %xmm0
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT: divl %edi
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movq %xmm0, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_udiv_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $56, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: pxor %xmm1, %xmm1
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: movdqu %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT: movdqa %xmm2, %xmm3
-; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: calll __udivdi3
+; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: movd %xmm1, %ebx
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; X86-NEXT: movd %xmm1, %esi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: divl %esi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: divl %ebx
; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%esi)
-; X86-NEXT: addl $56, %esp
+; X86-NEXT: movd %esi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%edi)
; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_udiv_v2i32:
@@ -901,66 +907,53 @@ define void @test_urem_v2i32(<2 x i32>*
; X64: # %bb.0:
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: movq %xmm0, %rax
-; X64-NEXT: movq %xmm2, %rsi
+; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT: movd %xmm2, %esi
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divq %rsi
-; X64-NEXT: movq %rdx, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: movq %xmm0, %rax
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; X64-NEXT: movq %xmm0, %rsi
+; X64-NEXT: divl %esi
+; X64-NEXT: movl %edx, %esi
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %edi
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divq %rsi
-; X64-NEXT: movq %rdx, %xmm0
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT: divl %edi
+; X64-NEXT: movd %edx, %xmm0
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movq %xmm0, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_urem_v2i32:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $56, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: pxor %xmm1, %xmm1
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: movdqu %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT: movdqa %xmm2, %xmm3
-; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%esi)
-; X86-NEXT: addl $56, %esp
+; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: movd %xmm1, %ebx
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; X86-NEXT: movd %xmm1, %esi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: divl %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: divl %ebx
+; X86-NEXT: movd %edx, %xmm0
+; X86-NEXT: movd %esi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%edi)
; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_urem_v2i32:
@@ -1016,69 +1009,55 @@ define void @test_urem_v2i32(<2 x i32>*
define void @test_sdiv_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwind {
; X64-LABEL: test_sdiv_v2i32:
; X64: # %bb.0:
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movslq (%rdi), %rcx
-; X64-NEXT: movslq 4(%rdi), %rax
-; X64-NEXT: movslq (%rsi), %rdi
-; X64-NEXT: movslq 4(%rsi), %rsi
-; X64-NEXT: cqto
-; X64-NEXT: idivq %rsi
-; X64-NEXT: movq %rax, %xmm0
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: cqto
-; X64-NEXT: idivq %rdi
-; X64-NEXT: movq %rax, %xmm1
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X64-NEXT: movq %xmm0, (%r8)
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT: movd %xmm2, %esi
+; X64-NEXT: cltd
+; X64-NEXT: idivl %esi
+; X64-NEXT: movl %eax, %esi
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %edi
+; X64-NEXT: cltd
+; X64-NEXT: idivl %edi
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movq %xmm0, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_sdiv_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $44, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl (%ecx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%ecx), %esi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl (%eax), %ebx
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: movl %ebx, %ebp
-; X86-NEXT: sarl $31, %ebp
+; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: movd %xmm1, %ebx
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; X86-NEXT: movd %xmm1, %esi
+; X86-NEXT: cltd
+; X86-NEXT: idivl %esi
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: sarl $31, %eax
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %esi
-; X86-NEXT: calll __divdi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: calll __divdi3
-; X86-NEXT: addl $16, %esp
+; X86-NEXT: cltd
+; X86-NEXT: idivl %ebx
; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movq %xmm0, (%eax)
-; X86-NEXT: addl $44, %esp
+; X86-NEXT: movd %esi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%edi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_sdiv_v2i32:
@@ -1134,69 +1113,55 @@ define void @test_sdiv_v2i32(<2 x i32>*
define void @test_srem_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwind {
; X64-LABEL: test_srem_v2i32:
; X64: # %bb.0:
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movslq (%rdi), %rcx
-; X64-NEXT: movslq 4(%rdi), %rax
-; X64-NEXT: movslq (%rsi), %rdi
-; X64-NEXT: movslq 4(%rsi), %rsi
-; X64-NEXT: cqto
-; X64-NEXT: idivq %rsi
-; X64-NEXT: movq %rax, %xmm0
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: cqto
-; X64-NEXT: idivq %rdi
-; X64-NEXT: movq %rax, %xmm1
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X64-NEXT: movq %xmm0, (%r8)
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT: movd %xmm2, %esi
+; X64-NEXT: cltd
+; X64-NEXT: idivl %esi
+; X64-NEXT: movl %eax, %esi
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %edi
+; X64-NEXT: cltd
+; X64-NEXT: idivl %edi
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movq %xmm0, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_srem_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $44, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl (%ecx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%ecx), %esi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl (%eax), %ebx
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: movl %ebx, %ebp
-; X86-NEXT: sarl $31, %ebp
+; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: movd %xmm1, %ebx
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; X86-NEXT: movd %xmm1, %esi
+; X86-NEXT: cltd
+; X86-NEXT: idivl %esi
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: sarl $31, %eax
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %esi
-; X86-NEXT: calll __divdi3
-; X86-NEXT: addl $16, %esp
+; X86-NEXT: cltd
+; X86-NEXT: idivl %ebx
; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: calll __divdi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movq %xmm0, (%eax)
-; X86-NEXT: addl $44, %esp
+; X86-NEXT: movd %esi, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: movq %xmm0, (%edi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_srem_v2i32: