[llvm] r342114 - [X86] Type legalize v2i32 div/rem by scalarizing rather than promoting

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 12 23:13:38 PDT 2018


Author: ctopper
Date: Wed Sep 12 23:13:37 2018
New Revision: 342114

URL: http://llvm.org/viewvc/llvm-project?rev=342114&view=rev
Log:
[X86] Type legalize v2i32 div/rem by scalarizing rather than promoting

Summary:
Previously we type legalized v2i32 div/rem by promoting to v2i64. But we don't support div/rem of vectors, so op legalization would then scalarize it using i64 scalar ops, since it doesn't know about the original promotion. 64-bit scalar divides are known to be slow on Intel hardware, and in 32-bit mode they require a libcall.

This patch switches type legalization to do the scalarizing itself using i32.
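
In effect, for a v2i32 udiv the two strategies boil down to the following (a hedged C++ sketch of the code each legalization produces, not of the legalizer itself; the function names are illustrative):

    #include <cstdint>

    // Before: promote to v2i64, then scalarize -> two 64-bit divides
    // (divq in 64-bit mode, __udivdi3 libcalls in 32-bit mode).
    void udiv_v2i32_old(const uint32_t *x, const uint32_t *y, uint32_t *z) {
      z[0] = uint32_t(uint64_t(x[0]) / uint64_t(y[0]));
      z[1] = uint32_t(uint64_t(x[1]) / uint64_t(y[1]));
    }

    // After: scalarize directly at i32 -> two 32-bit divides (divl).
    void udiv_v2i32_new(const uint32_t *x, const uint32_t *y, uint32_t *z) {
      z[0] = x[0] / y[0];
      z[1] = x[1] / y[1];
    }

Both compute the same results, since the promoted operands are just zero-extended i32 values.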

It looks like the division-by-power-of-2 optimization still kicks in and leaves the code as a vector. The division-by-arbitrary-constant optimization doesn't kick in before type legalization since it ignores illegal types. And previously, after type legalization, we scalarized the v2i64 anyway since we don't have v2i64 MULHS/MULHU support.
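
For reference, the scalarized sequences the updated tests check are the standard multiply-high expansions for a constant divisor of 7, now done at i32 after scalarization. A C++ sketch of what the CHECK lines compute (the magic constants 0x24924925 and -1840700269/0x92492493 match the immediates in the tests; the helper names are illustrative, and the signed version assumes arithmetic right shift as on all mainstream compilers):

    #include <cstdint>

    // Unsigned x/7; 0x24924925 = floor(2^35 / 7) + 1 - 2^32.
    uint32_t udiv7(uint32_t x) {
      uint32_t t = (uint64_t(x) * 0x24924925u) >> 32; // imulq/mull, shr $32
      return (((x - t) >> 1) + t) >> 2;               // subl, shrl, addl, shrl $2
    }

    // Signed x/7 via sign-extended multiply-high with 0x92492493.
    int32_t sdiv7(int32_t x) {
      int32_t hi = int32_t((int64_t(x) * -1840700269LL) >> 32); // imulq, shrq $32
      int32_t t = hi + x;                                       // addl
      return (t >> 2) + int32_t(uint32_t(t) >> 31);             // sarl $2 plus sign bit
    }

The urem7/srem7 variants then form the remainder as x - q*7, which the tests compute as leal (,%q,8) followed by a subtract and an add of the original value (x - (8*q - q)).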

Another option might be to widen v2i32 to v4i32 so we could do the division-by-constant optimizations, but we'd have to be careful to do that only for constant divisors, or we'd risk scalarizing into 4 scalar divides.

Reviewers: RKSimon, spatel

Reviewed By: spatel

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D51325

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll
    llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=342114&r1=342113&r2=342114&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Sep 12 23:13:37 2018
@@ -779,6 +779,11 @@ X86TargetLowering::X86TargetLowering(con
     addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                     : &X86::VR128RegClass);
 
+    setOperationAction(ISD::SDIV, MVT::v2i32, Custom);
+    setOperationAction(ISD::SREM, MVT::v2i32, Custom);
+    setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
+    setOperationAction(ISD::UREM, MVT::v2i32, Custom);
+
     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
@@ -25828,6 +25833,18 @@ void X86TargetLowering::ReplaceNodeResul
   case ISD::UDIV:
   case ISD::SREM:
   case ISD::UREM:
+    if (N->getValueType(0) == MVT::v2i32) {
+      // If we're already legalizing via widening, we don't need this since
+      // that will scalarize div/rem.
+      if (getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector)
+        return;
+      // Legalize v2i32 div/rem by unrolling. Otherwise we promote to
+      // v2i64 and unroll later. But then we create i64 scalar ops which
+      // might be slow in 64-bit mode or require a libcall in 32-bit mode.
+      Results.push_back(DAG.UnrollVectorOp(N));
+      return;
+    }
+    LLVM_FALLTHROUGH;
   case ISD::SDIVREM:
   case ISD::UDIVREM: {
     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);

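For context, DAG.UnrollVectorOp scalarizes the node: it extracts each element, applies the scalar opcode, and rebuilds the vector. For the v2i32 case it is roughly equivalent to the following (a simplified sketch that assumes the usual SelectionDAG context, with DAG and N in scope; the real helper is generic over element count and operand kinds):

    // Hypothetical illustration of what unrolling produces for
    // (udiv/sdiv/urem/srem v2i32 A, B); not the actual helper.
    SDLoc DL(N);
    SDValue A = N->getOperand(0), B = N->getOperand(1);
    SmallVector<SDValue, 2> Elts;
    for (unsigned i = 0; i != 2; ++i) {
      SDValue Lhs = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, A,
                                DAG.getIntPtrConstant(i, DL));
      SDValue Rhs = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, B,
                                DAG.getIntPtrConstant(i, DL));
      Elts.push_back(DAG.getNode(N->getOpcode(), DL, MVT::i32, Lhs, Rhs));
    }
    SDValue Unrolled = DAG.getBuildVector(MVT::v2i32, DL, Elts);

The resulting extract/scalar-op/build-vector pattern is what shows up as movd/pextrd, divl/idivl, and movd/pinsrd or punpckldq in the updated test checks below.
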
Modified: llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll?rev=342114&r1=342113&r2=342114&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll (original)
+++ llvm/trunk/test/CodeGen/X86/scalar_widen_div.ll Wed Sep 12 23:13:37 2018
@@ -13,20 +13,19 @@ define void @vectorDiv (<2 x i32> addrsp
 ; CHECK-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movslq -{{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT:    pmovsxdq (%rdi,%rcx,8), %xmm0
-; CHECK-NEXT:    pmovsxdq (%rsi,%rcx,8), %xmm1
-; CHECK-NEXT:    pextrq $1, %xmm0, %rax
-; CHECK-NEXT:    pextrq $1, %xmm1, %rsi
-; CHECK-NEXT:    cqto
-; CHECK-NEXT:    idivq %rsi
-; CHECK-NEXT:    movq %rax, %xmm2
-; CHECK-NEXT:    movq %xmm0, %rax
-; CHECK-NEXT:    movq %xmm1, %rsi
-; CHECK-NEXT:    cqto
-; CHECK-NEXT:    idivq %rsi
-; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    pextrd $1, %xmm0, %eax
+; CHECK-NEXT:    pextrd $1, %xmm1, %esi
+; CHECK-NEXT:    cltd
+; CHECK-NEXT:    idivl %esi
+; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    movd %xmm1, %edi
+; CHECK-NEXT:    cltd
+; CHECK-NEXT:    idivl %edi
+; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    pinsrd $1, %esi, %xmm0
 ; CHECK-NEXT:    movq %xmm0, (%r8,%rcx,8)
 ; CHECK-NEXT:    retq
 entry:

Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll?rev=342114&r1=342113&r2=342114&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll Wed Sep 12 23:13:37 2018
@@ -8,60 +8,58 @@ define void @test_udiv7_v2i32(<2 x i32>*
 ; X64-LABEL: test_udiv7_v2i32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pxor %xmm1, %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    imulq $613566757, %rax, %rcx # imm = 0x24924925
+; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    shrl $2, %eax
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; X64-NEXT:    shrq $32, %rdx
+; X64-NEXT:    subl %edx, %ecx
+; X64-NEXT:    shrl %ecx
+; X64-NEXT:    addl %edx, %ecx
+; X64-NEXT:    shrl $2, %ecx
+; X64-NEXT:    movd %ecx, %xmm0
+; X64-NEXT:    movd %eax, %xmm1
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT:    movq %xmm0, %rcx
-; X64-NEXT:    movabsq $2635249153387078803, %rdi # imm = 0x2492492492492493
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    subq %rdx, %rcx
-; X64-NEXT:    shrq %rcx
-; X64-NEXT:    addq %rdx, %rcx
-; X64-NEXT:    shrq $2, %rcx
-; X64-NEXT:    movq %rcx, %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT:    movq %xmm0, %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    subq %rdx, %rcx
-; X64-NEXT:    shrq %rcx
-; X64-NEXT:    addq %rdx, %rcx
-; X64-NEXT:    shrq $2, %rcx
-; X64-NEXT:    movq %rcx, %xmm0
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-NEXT:    movq %xmm0, (%rsi)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_udiv7_v2i32:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __udivdi3
-; X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movd %xmm0, %ecx
 ; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    calll __udivdi3
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X86-NEXT:    movq %xmm0, (%esi)
-; X86-NEXT:    addl $40, %esp
+; X86-NEXT:    movd %xmm0, %esi
+; X86-NEXT:    movl $613566757, %ebx # imm = 0x24924925
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    subl %edx, %esi
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    shrl $2, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    shrl %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    movd %ecx, %xmm0
+; X86-NEXT:    movd %esi, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movq %xmm0, (%edi)
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64_WIDEN-LABEL: test_udiv7_v2i32:
@@ -126,68 +124,76 @@ define void @test_urem7_v2i32(<2 x i32>*
 ; X64-LABEL: test_urem7_v2i32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pxor %xmm1, %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; X64-NEXT:    shrq $32, %rdx
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    subl %edx, %eax
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    addl %edx, %eax
+; X64-NEXT:    shrl $2, %eax
+; X64-NEXT:    leal (,%rax,8), %edx
+; X64-NEXT:    subl %edx, %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; X64-NEXT:    shrq $32, %rdx
+; X64-NEXT:    movl %ecx, %edi
+; X64-NEXT:    subl %edx, %edi
+; X64-NEXT:    shrl %edi
+; X64-NEXT:    addl %edx, %edi
+; X64-NEXT:    shrl $2, %edi
+; X64-NEXT:    leal (,%rdi,8), %edx
+; X64-NEXT:    subl %edx, %edi
+; X64-NEXT:    addl %ecx, %edi
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    movd %eax, %xmm1
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT:    movq %xmm0, %rcx
-; X64-NEXT:    movabsq $2635249153387078803, %rdi # imm = 0x2492492492492493
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    subq %rdx, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    addq %rdx, %rax
-; X64-NEXT:    shrq $2, %rax
-; X64-NEXT:    leaq (,%rax,8), %rdx
-; X64-NEXT:    subq %rdx, %rax
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movq %rax, %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT:    movq %xmm0, %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    subq %rdx, %rax
-; X64-NEXT:    shrq %rax
-; X64-NEXT:    addq %rdx, %rax
-; X64-NEXT:    shrq $2, %rax
-; X64-NEXT:    leaq (,%rax,8), %rdx
-; X64-NEXT:    subq %rdx, %rax
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movq %rax, %xmm0
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-NEXT:    movq %xmm0, (%rsi)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_urem7_v2i32:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __umoddi3
-; X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movd %xmm0, %ecx
 ; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; X86-NEXT:    movss %xmm0, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $7, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    calll __umoddi3
+; X86-NEXT:    movd %xmm0, %esi
+; X86-NEXT:    movl $613566757, %edi # imm = 0x24924925
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    subl %edx, %ebx
+; X86-NEXT:    shrl %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    shrl $2, %ebx
+; X86-NEXT:    leal (,%ebx,8), %eax
+; X86-NEXT:    subl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (,%eax,8), %edx
+; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X86-NEXT:    movq %xmm0, (%esi)
-; X86-NEXT:    addl $40, %esp
+; X86-NEXT:    movd %ebx, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movq %xmm0, (%ebp)
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64_WIDEN-LABEL: test_urem7_v2i32:
@@ -269,64 +275,68 @@ define void @test_urem7_v2i32(<2 x i32>*
 define void @test_sdiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X64-LABEL: test_sdiv7_v2i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movslq (%rdi), %rcx
-; X64-NEXT:    movslq 4(%rdi), %rax
-; X64-NEXT:    movabsq $5270498306774157605, %rdi # imm = 0x4924924924924925
-; X64-NEXT:    imulq %rdi
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    shrq $63, %rax
-; X64-NEXT:    sarq %rdx
-; X64-NEXT:    addq %rax, %rdx
-; X64-NEXT:    movq %rdx, %xmm0
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    imulq %rdi
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    shrq $63, %rax
-; X64-NEXT:    sarq %rdx
-; X64-NEXT:    addq %rax, %rdx
-; X64-NEXT:    movq %rdx, %xmm1
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    cltq
+; X64-NEXT:    imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    shrl $31, %ecx
+; X64-NEXT:    sarl $2, %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    movslq %ecx, %rcx
+; X64-NEXT:    imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; X64-NEXT:    shrq $32, %rdx
+; X64-NEXT:    addl %edx, %ecx
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    shrl $31, %edx
+; X64-NEXT:    sarl $2, %ecx
+; X64-NEXT:    addl %edx, %ecx
+; X64-NEXT:    movd %ecx, %xmm0
+; X64-NEXT:    movd %eax, %xmm1
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    movq %xmm0, (%rsi)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_sdiv7_v2i32:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edi
-; X86-NEXT:    movl 4(%eax), %eax
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $7
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __divdi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu %xmm0, (%esp) # 16-byte Spill
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $7
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    calll __divdi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu (%esp), %xmm1 # 16-byte Reload
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT:    movq %xmm0, (%esi)
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movd %xmm0, %ecx
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT:    movd %xmm0, %esi
+; X86-NEXT:    movl $-1840700269, %ebp # imm = 0x92492493
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    shrl $31, %eax
+; X86-NEXT:    sarl $2, %edi
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    imull %ebp
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrl $31, %eax
+; X86-NEXT:    sarl $2, %edx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    movd %edx, %xmm0
+; X86-NEXT:    movd %edi, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movq %xmm0, (%ebx)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64_WIDEN-LABEL: test_sdiv7_v2i32:
@@ -399,71 +409,80 @@ define void @test_sdiv7_v2i32(<2 x i32>*
 define void @test_srem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X64-LABEL: test_srem7_v2i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movslq (%rdi), %rcx
-; X64-NEXT:    movslq 4(%rdi), %rdi
-; X64-NEXT:    movabsq $5270498306774157605, %r8 # imm = 0x4924924924924925
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    imulq %r8
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    shrq $63, %rax
-; X64-NEXT:    sarq %rdx
-; X64-NEXT:    addq %rax, %rdx
-; X64-NEXT:    leaq (,%rdx,8), %rax
-; X64-NEXT:    subq %rax, %rdx
-; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq %rdx, %xmm0
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    imulq %r8
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    shrq $63, %rax
-; X64-NEXT:    sarq %rdx
-; X64-NEXT:    addq %rax, %rdx
-; X64-NEXT:    leaq (,%rdx,8), %rax
-; X64-NEXT:    subq %rax, %rdx
-; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    movq %rdx, %xmm1
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    movslq %eax, %rcx
+; X64-NEXT:    imulq $-1840700269, %rcx, %rax # imm = 0x92492493
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    movl %eax, %edx
+; X64-NEXT:    shrl $31, %edx
+; X64-NEXT:    sarl $2, %eax
+; X64-NEXT:    addl %edx, %eax
+; X64-NEXT:    leal (,%rax,8), %edx
+; X64-NEXT:    subl %edx, %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    movslq %ecx, %rcx
+; X64-NEXT:    imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; X64-NEXT:    shrq $32, %rdx
+; X64-NEXT:    addl %ecx, %edx
+; X64-NEXT:    movl %edx, %edi
+; X64-NEXT:    shrl $31, %edi
+; X64-NEXT:    sarl $2, %edx
+; X64-NEXT:    addl %edi, %edx
+; X64-NEXT:    leal (,%rdx,8), %edi
+; X64-NEXT:    subl %edi, %edx
+; X64-NEXT:    addl %ecx, %edx
+; X64-NEXT:    movd %edx, %xmm0
+; X64-NEXT:    movd %eax, %xmm1
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    movq %xmm0, (%rsi)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_srem7_v2i32:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edi
-; X86-NEXT:    movl 4(%eax), %eax
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $7
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    calll __moddi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu %xmm0, (%esp) # 16-byte Spill
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $7
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    calll __moddi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu (%esp), %xmm1 # 16-byte Reload
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT:    movq %xmm0, (%esi)
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movd %xmm0, %ecx
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT:    movd %xmm0, %esi
+; X86-NEXT:    movl $-1840700269, %ebx # imm = 0x92492493
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    shrl $31, %eax
+; X86-NEXT:    sarl $2, %edi
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    leal (,%edi,8), %eax
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    imull %ebx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrl $31, %eax
+; X86-NEXT:    sarl $2, %edx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    leal (,%edx,8), %eax
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movd %edx, %xmm0
+; X86-NEXT:    movd %edi, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movq %xmm0, (%ebp)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64_WIDEN-LABEL: test_srem7_v2i32:
@@ -784,66 +803,53 @@ define void @test_udiv_v2i32(<2 x i32>*
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pxor %xmm1, %xmm1
-; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT:    movq %xmm0, %rax
-; X64-NEXT:    movq %xmm2, %rsi
+; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT:    movd %xmm2, %eax
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT:    movd %xmm2, %esi
 ; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    divq %rsi
-; X64-NEXT:    movq %rax, %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT:    movq %xmm0, %rax
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; X64-NEXT:    movq %xmm0, %rsi
+; X64-NEXT:    divl %esi
+; X64-NEXT:    movl %eax, %esi
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    movd %xmm1, %edi
 ; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    divq %rsi
-; X64-NEXT:    movq %rax, %xmm0
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT:    divl %edi
+; X64-NEXT:    movd %eax, %xmm0
+; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    movq %xmm0, (%rcx)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_udiv_v2i32:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    pxor %xmm1, %xmm1
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT:    movdqu %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT:    movdqa %xmm2, %xmm3
-; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    calll __udivdi3
-; X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    calll __udivdi3
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    movd %xmm0, %ecx
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    movd %xmm1, %ebx
+; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    divl %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    divl %ebx
 ; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X86-NEXT:    movq %xmm0, (%esi)
-; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    movd %esi, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movq %xmm0, (%edi)
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64_WIDEN-LABEL: test_udiv_v2i32:
@@ -901,66 +907,53 @@ define void @test_urem_v2i32(<2 x i32>*
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pxor %xmm1, %xmm1
-; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT:    movq %xmm0, %rax
-; X64-NEXT:    movq %xmm2, %rsi
+; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT:    movd %xmm2, %eax
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT:    movd %xmm2, %esi
 ; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    divq %rsi
-; X64-NEXT:    movq %rdx, %xmm1
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT:    movq %xmm0, %rax
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; X64-NEXT:    movq %xmm0, %rsi
+; X64-NEXT:    divl %esi
+; X64-NEXT:    movl %edx, %esi
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    movd %xmm1, %edi
 ; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    divq %rsi
-; X64-NEXT:    movq %rdx, %xmm0
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT:    divl %edi
+; X64-NEXT:    movd %edx, %xmm0
+; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    movq %xmm0, (%rcx)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_urem_v2i32:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    pxor %xmm1, %xmm1
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT:    movdqu %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT:    movdqa %xmm2, %xmm3
-; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    calll __umoddi3
-; X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    calll __umoddi3
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X86-NEXT:    movq %xmm0, (%esi)
-; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    movd %xmm0, %ecx
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    movd %xmm1, %ebx
+; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    divl %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    divl %ebx
+; X86-NEXT:    movd %edx, %xmm0
+; X86-NEXT:    movd %esi, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movq %xmm0, (%edi)
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64_WIDEN-LABEL: test_urem_v2i32:
@@ -1016,69 +1009,55 @@ define void @test_urem_v2i32(<2 x i32>*
 define void @test_sdiv_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwind {
 ; X64-LABEL: test_sdiv_v2i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movslq (%rdi), %rcx
-; X64-NEXT:    movslq 4(%rdi), %rax
-; X64-NEXT:    movslq (%rsi), %rdi
-; X64-NEXT:    movslq 4(%rsi), %rsi
-; X64-NEXT:    cqto
-; X64-NEXT:    idivq %rsi
-; X64-NEXT:    movq %rax, %xmm0
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    cqto
-; X64-NEXT:    idivq %rdi
-; X64-NEXT:    movq %rax, %xmm1
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X64-NEXT:    movq %xmm0, (%r8)
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT:    movd %xmm2, %eax
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT:    movd %xmm2, %esi
+; X64-NEXT:    cltd
+; X64-NEXT:    idivl %esi
+; X64-NEXT:    movl %eax, %esi
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    movd %xmm1, %edi
+; X64-NEXT:    cltd
+; X64-NEXT:    idivl %edi
+; X64-NEXT:    movd %eax, %xmm0
+; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    movq %xmm0, (%rcx)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_sdiv_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $44, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl (%eax), %ebx
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    sarl $31, %ebp
+; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    movd %xmm0, %ecx
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    movd %xmm1, %ebx
+; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    cltd
+; X86-NEXT:    idivl %esi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    calll __divdi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    calll __divdi3
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    cltd
+; X86-NEXT:    idivl %ebx
 ; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movq %xmm0, (%eax)
-; X86-NEXT:    addl $44, %esp
+; X86-NEXT:    movd %esi, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movq %xmm0, (%edi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64_WIDEN-LABEL: test_sdiv_v2i32:
@@ -1134,69 +1113,55 @@ define void @test_sdiv_v2i32(<2 x i32>*
 define void @test_srem_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwind {
 ; X64-LABEL: test_srem_v2i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movslq (%rdi), %rcx
-; X64-NEXT:    movslq 4(%rdi), %rax
-; X64-NEXT:    movslq (%rsi), %rdi
-; X64-NEXT:    movslq 4(%rsi), %rsi
-; X64-NEXT:    cqto
-; X64-NEXT:    idivq %rsi
-; X64-NEXT:    movq %rax, %xmm0
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    cqto
-; X64-NEXT:    idivq %rdi
-; X64-NEXT:    movq %rax, %xmm1
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X64-NEXT:    movq %xmm0, (%r8)
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
+; X64-NEXT:    movd %xmm2, %eax
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
+; X64-NEXT:    movd %xmm2, %esi
+; X64-NEXT:    cltd
+; X64-NEXT:    idivl %esi
+; X64-NEXT:    movl %eax, %esi
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    movd %xmm1, %edi
+; X64-NEXT:    cltd
+; X64-NEXT:    idivl %edi
+; X64-NEXT:    movd %eax, %xmm0
+; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    movq %xmm0, (%rcx)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_srem_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $44, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl (%eax), %ebx
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    sarl $31, %ebp
+; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    movd %xmm0, %ecx
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    movd %xmm1, %ebx
+; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
+; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    cltd
+; X86-NEXT:    idivl %esi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    calll __divdi3
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    cltd
+; X86-NEXT:    idivl %ebx
 ; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    calll __divdi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movq %xmm0, (%eax)
-; X86-NEXT:    addl $44, %esp
+; X86-NEXT:    movd %esi, %xmm1
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movq %xmm0, (%edi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64_WIDEN-LABEL: test_srem_v2i32:
