[llvm] 7b03725 - Revert "[x86] try harder to scalarize a vector load with extracted integer op uses"

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 4 04:55:16 PST 2022


Author: Sanjay Patel
Date: 2022-02-04T07:45:57-05:00
New Revision: 7b03725097872fbd3369a7213c1d98b372aa2d78

URL: https://github.com/llvm/llvm-project/commit/7b03725097872fbd3369a7213c1d98b372aa2d78
DIFF: https://github.com/llvm/llvm-project/commit/7b03725097872fbd3369a7213c1d98b372aa2d78.diff

LOG: Revert "[x86] try harder to scalarize a vector load with extracted integer op uses"

This reverts commit b4b97ec813a02585000f30ac7d532dda74e8bfda.

As discussed in post-commit feedback at:
https://reviews.llvm.org/D118376
...there's a stage 2 failure on a Mac running a clang-refactor tool test.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
    llvm/test/CodeGen/X86/avx512-cvt.ll
    llvm/test/CodeGen/X86/bitcast-vector-bool.ll
    llvm/test/CodeGen/X86/extractelement-load.ll
    llvm/test/CodeGen/X86/oddsubvector.ll
    llvm/test/CodeGen/X86/pr45378.ll
    llvm/test/CodeGen/X86/scalar_widen_div.ll
    llvm/test/CodeGen/X86/shrink_vmul.ll
    llvm/test/CodeGen/X86/vec_cast.ll
    llvm/test/CodeGen/X86/vec_int_to_fp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1d927d59d1aec..04b5984ce4242 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43106,38 +43106,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // If this extract is from a loaded vector value and will be used as an
-  // integer, that requires a potentially expensive XMM -> GPR transfer.
-  // Additionally, if we can convert to a scalar integer load, that will likely
-  // be folded into a subsequent integer op.
-  // Note: Unlike the related fold for this in DAGCombiner, this is not limited
-  //       to a single-use of the loaded vector. For the reasons above, we
-  //       expect this to be profitable even if it creates an extra load.
-  bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
-    return Use->getOpcode() == ISD::STORE ||
-           Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
-           Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
-  });
-  auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
-  if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
-      SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
-      !LikelyUsedAsVector) {
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    SDValue NewPtr =
-        TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
-    unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
-    MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
-    Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
-    SDValue Load =
-        DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
-                    LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
-    SDValue Chain = Load.getValue(1);
-    SDValue From[] = {SDValue(N, 0), SDValue(LoadVec, 1)};
-    SDValue To[] = {Load, Chain};
-    DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
-    return SDValue(N, 0);
-  }
-
   return SDValue();
 }
 

diff  --git a/llvm/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll b/llvm/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
index 5fb01b8c1b992..ad52d58bde1c6 100644
--- a/llvm/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
+++ b/llvm/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
@@ -10,13 +10,13 @@
 define <4 x i32> @test(<4 x i32>* %p) {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl $3, 8(%rdi)
-; CHECK-NEXT:    je .LBB0_1
-; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB0_1:
 ; CHECK-NEXT:    movaps (%rdi), %xmm0
+; CHECK-NEXT:    extractps $2, %xmm0, %eax
+; CHECK-NEXT:    cmpl $3, %eax
+; CHECK-NEXT:    je .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    retq
   %v = load <4 x i32>, <4 x i32>* %p
   %e = extractelement <4 x i32> %v, i32 2

diff  --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index f632db7160e60..5d7940b2fa7ce 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -148,12 +148,18 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
 define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
 ; NODQ-LABEL: slto4f32_mem:
 ; NODQ:       # %bb.0:
-; NODQ-NEXT:    vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
-; NODQ-NEXT:    vcvtsi2ssq (%rdi), %xmm1, %xmm1
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; NODQ-NEXT:    vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; NODQ-NEXT:    vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
+; NODQ-NEXT:    vmovdqu (%rdi), %xmm0
+; NODQ-NEXT:    vmovdqu 16(%rdi), %xmm1
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; NODQ-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index a47dbe570fd52..bf3ffecb414ff 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -542,8 +542,10 @@ define i32 @bitcast_v64i8_to_v2i32(<64 x i8> %a0) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512-NEXT:    kmovq %k0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT:    addl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %ecx
+; AVX512-NEXT:    vpextrd $1, %xmm0, %eax
+; AVX512-NEXT:    addl %ecx, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = icmp slt <64 x i8> %a0, zeroinitializer

diff  --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index c0c32eb3cfce8..2a7ed3a8b4e71 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -301,35 +301,33 @@ define void @subextract_broadcast_load_constant(<2 x i16>* nocapture %0, i16* no
   ret void
 }
 
-; A scalar load is favored over a XMM->GPR register transfer in this example.
-
 define i32 @multi_use_load_scalarization(<4 x i32>* %p) {
 ; X32-SSE2-LABEL: multi_use_load_scalarization:
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE2-NEXT:    movl (%ecx), %eax
 ; X32-SSE2-NEXT:    movdqu (%ecx), %xmm0
 ; X32-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE2-NEXT:    movd %xmm0, %eax
 ; X32-SSE2-NEXT:    psubd %xmm1, %xmm0
 ; X32-SSE2-NEXT:    movdqa %xmm0, (%ecx)
 ; X32-SSE2-NEXT:    retl
 ;
 ; X64-SSSE3-LABEL: multi_use_load_scalarization:
 ; X64-SSSE3:       # %bb.0:
-; X64-SSSE3-NEXT:    movl (%rdi), %eax
 ; X64-SSSE3-NEXT:    movdqu (%rdi), %xmm0
 ; X64-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-SSSE3-NEXT:    movd %xmm0, %eax
 ; X64-SSSE3-NEXT:    psubd %xmm1, %xmm0
 ; X64-SSSE3-NEXT:    movdqa %xmm0, (%rdi)
 ; X64-SSSE3-NEXT:    retq
 ;
 ; X64-AVX-LABEL: multi_use_load_scalarization:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    movl (%rdi), %eax
 ; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
 ; X64-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; X64-AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
+; X64-AVX-NEXT:    vmovdqa %xmm1, (%rdi)
+; X64-AVX-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX-NEXT:    retq
   %v = load <4 x i32>, <4 x i32>* %p, align 1
   %v1 = add <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>

diff  --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index fb4c23e7de1a4..a28a64f4fc60b 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -161,46 +161,46 @@ define <16 x i32> @PR42819(<8 x i32>* %a0) {
 define void @PR42833() {
 ; SSE2-LABEL: PR42833:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movl b(%rip), %eax
-; SSE2-NEXT:    movdqa c+144(%rip), %xmm0
-; SSE2-NEXT:    movdqa c+128(%rip), %xmm1
-; SSE2-NEXT:    addl c+128(%rip), %eax
+; SSE2-NEXT:    movdqa c+144(%rip), %xmm1
+; SSE2-NEXT:    movdqa c+128(%rip), %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    addl b(%rip), %eax
 ; SSE2-NEXT:    movd %eax, %xmm2
 ; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    paddd %xmm1, %xmm3
+; SSE2-NEXT:    paddd %xmm0, %xmm3
 ; SSE2-NEXT:    movdqa d+144(%rip), %xmm4
-; SSE2-NEXT:    psubd %xmm0, %xmm4
-; SSE2-NEXT:    paddd %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    paddd %xmm1, %xmm5
+; SSE2-NEXT:    psubd %xmm1, %xmm4
+; SSE2-NEXT:    paddd %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    paddd %xmm0, %xmm5
 ; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
-; SSE2-NEXT:    movdqa %xmm0, c+144(%rip)
+; SSE2-NEXT:    movdqa %xmm1, c+144(%rip)
 ; SSE2-NEXT:    movaps %xmm5, c+128(%rip)
-; SSE2-NEXT:    movdqa c+160(%rip), %xmm0
+; SSE2-NEXT:    movdqa c+160(%rip), %xmm1
 ; SSE2-NEXT:    movdqa c+176(%rip), %xmm3
 ; SSE2-NEXT:    movdqa d+160(%rip), %xmm5
 ; SSE2-NEXT:    movdqa d+176(%rip), %xmm6
 ; SSE2-NEXT:    movdqa d+128(%rip), %xmm7
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
-; SSE2-NEXT:    psubd %xmm1, %xmm7
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT:    psubd %xmm0, %xmm7
 ; SSE2-NEXT:    psubd %xmm3, %xmm6
-; SSE2-NEXT:    psubd %xmm0, %xmm5
+; SSE2-NEXT:    psubd %xmm1, %xmm5
 ; SSE2-NEXT:    movdqa %xmm5, d+160(%rip)
 ; SSE2-NEXT:    movdqa %xmm6, d+176(%rip)
 ; SSE2-NEXT:    movdqa %xmm4, d+144(%rip)
 ; SSE2-NEXT:    movdqa %xmm7, d+128(%rip)
 ; SSE2-NEXT:    paddd %xmm3, %xmm3
-; SSE2-NEXT:    paddd %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, c+160(%rip)
+; SSE2-NEXT:    paddd %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, c+160(%rip)
 ; SSE2-NEXT:    movdqa %xmm3, c+176(%rip)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: PR42833:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movl b(%rip), %eax
 ; SSE42-NEXT:    movdqa c+144(%rip), %xmm0
 ; SSE42-NEXT:    movdqa c+128(%rip), %xmm1
-; SSE42-NEXT:    addl c+128(%rip), %eax
+; SSE42-NEXT:    movd %xmm1, %eax
+; SSE42-NEXT:    addl b(%rip), %eax
 ; SSE42-NEXT:    movd %eax, %xmm2
 ; SSE42-NEXT:    paddd %xmm1, %xmm2
 ; SSE42-NEXT:    movdqa d+144(%rip), %xmm3
@@ -232,20 +232,20 @@ define void @PR42833() {
 ;
 ; AVX1-LABEL: PR42833:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    movl b(%rip), %eax
-; AVX1-NEXT:    addl c+128(%rip), %eax
-; AVX1-NEXT:    vmovd %eax, %xmm0
-; AVX1-NEXT:    vmovdqa c+128(%rip), %xmm1
-; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpaddd %xmm1, %xmm1, %xmm2
+; AVX1-NEXT:    vmovdqa c+128(%rip), %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    addl b(%rip), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
 ; AVX1-NEXT:    vmovdqa c+144(%rip), %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vmovdqa d+144(%rip), %xmm2
 ; AVX1-NEXT:    vpsubd c+144(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vmovups %ymm0, c+128(%rip)
-; AVX1-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm0
+; AVX1-NEXT:    vmovups %ymm1, c+128(%rip)
+; AVX1-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqa d+128(%rip), %xmm1
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vmovdqa d+176(%rip), %xmm1
@@ -314,20 +314,20 @@ define void @PR42833() {
 ;
 ; XOP-LABEL: PR42833:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    movl b(%rip), %eax
-; XOP-NEXT:    addl c+128(%rip), %eax
-; XOP-NEXT:    vmovd %eax, %xmm0
-; XOP-NEXT:    vmovdqa c+128(%rip), %xmm1
-; XOP-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; XOP-NEXT:    vpaddd %xmm1, %xmm1, %xmm2
+; XOP-NEXT:    vmovdqa c+128(%rip), %xmm0
+; XOP-NEXT:    vmovd %xmm0, %eax
+; XOP-NEXT:    addl b(%rip), %eax
+; XOP-NEXT:    vmovd %eax, %xmm1
+; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; XOP-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
 ; XOP-NEXT:    vmovdqa c+144(%rip), %xmm3
 ; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
 ; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
+; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; XOP-NEXT:    vmovdqa d+144(%rip), %xmm2
 ; XOP-NEXT:    vpsubd c+144(%rip), %xmm2, %xmm2
-; XOP-NEXT:    vmovups %ymm0, c+128(%rip)
-; XOP-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm0
+; XOP-NEXT:    vmovups %ymm1, c+128(%rip)
+; XOP-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
 ; XOP-NEXT:    vmovdqa d+128(%rip), %xmm1
 ; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; XOP-NEXT:    vmovdqa d+176(%rip), %xmm1

diff  --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll
index 0f2b212242793..d5da5c73a891d 100644
--- a/llvm/test/CodeGen/X86/pr45378.ll
+++ b/llvm/test/CodeGen/X86/pr45378.ll
@@ -76,23 +76,28 @@ define i1 @parseHeaders2_scalar_and(i64 * %ptr) nounwind {
 ; SSE2-LABEL: parseHeaders2_scalar_and:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqu (%rdi), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    testq %rcx, %rax
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: parseHeaders2_scalar_and:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq (%rdi), %rax
-; SSE41-NEXT:    testq %rax, 8(%rdi)
+; SSE41-NEXT:    movdqu (%rdi), %xmm0
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE41-NEXT:    testq %rcx, %rax
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: parseHeaders2_scalar_and:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rax
-; AVX-NEXT:    testq %rax, 8(%rdi)
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX-NEXT:    testq %rcx, %rax
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
   %vptr = bitcast i64 * %ptr to <2 x i64> *

diff  --git a/llvm/test/CodeGen/X86/scalar_widen_div.ll b/llvm/test/CodeGen/X86/scalar_widen_div.ll
index 816892d62a0b3..d21b441580688 100644
--- a/llvm/test/CodeGen/X86/scalar_widen_div.ll
+++ b/llvm/test/CodeGen/X86/scalar_widen_div.ll
@@ -403,29 +403,32 @@ define void @test_int_div(<3 x i32>* %dest, <3 x i32>* %old, i32 %n) {
 ; CHECK-NEXT:    testl %edx, %edx
 ; CHECK-NEXT:    jle .LBB12_3
 ; CHECK-NEXT:  # %bb.1: # %bb.nph
-; CHECK-NEXT:    movl %edx, %r11d
+; CHECK-NEXT:    movl %edx, %r9d
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB12_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl 8(%rdi,%rcx), %r8d
-; CHECK-NEXT:    movl (%rdi,%rcx), %r9d
-; CHECK-NEXT:    movl 4(%rdi,%rcx), %eax
+; CHECK-NEXT:    movdqa (%rdi,%rcx), %xmm0
+; CHECK-NEXT:    movdqa (%rsi,%rcx), %xmm1
+; CHECK-NEXT:    pextrd $1, %xmm0, %eax
+; CHECK-NEXT:    pextrd $1, %xmm1, %r8d
 ; CHECK-NEXT:    cltd
-; CHECK-NEXT:    idivl 4(%rsi,%rcx)
-; CHECK-NEXT:    movl %eax, %r10d
-; CHECK-NEXT:    movl %r9d, %eax
+; CHECK-NEXT:    idivl %r8d
+; CHECK-NEXT:    movl %eax, %r8d
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    movd %xmm1, %r10d
 ; CHECK-NEXT:    cltd
-; CHECK-NEXT:    idivl (%rsi,%rcx)
-; CHECK-NEXT:    movd %eax, %xmm0
-; CHECK-NEXT:    pinsrd $1, %r10d, %xmm0
-; CHECK-NEXT:    movl %r8d, %eax
+; CHECK-NEXT:    idivl %r10d
+; CHECK-NEXT:    movd %eax, %xmm2
+; CHECK-NEXT:    pinsrd $1, %r8d, %xmm2
+; CHECK-NEXT:    pextrd $2, %xmm0, %eax
+; CHECK-NEXT:    pextrd $2, %xmm1, %r8d
 ; CHECK-NEXT:    cltd
-; CHECK-NEXT:    idivl 8(%rsi,%rcx)
+; CHECK-NEXT:    idivl %r8d
 ; CHECK-NEXT:    movl %eax, 8(%rdi,%rcx)
-; CHECK-NEXT:    movq %xmm0, (%rdi,%rcx)
+; CHECK-NEXT:    movq %xmm2, (%rdi,%rcx)
 ; CHECK-NEXT:    addq $16, %rcx
-; CHECK-NEXT:    decl %r11d
+; CHECK-NEXT:    decl %r9d
 ; CHECK-NEXT:    jne .LBB12_2
 ; CHECK-NEXT:  .LBB12_3: # %for.end
 ; CHECK-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 5558b7eefffb8..82a633b6d8cad 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -2072,10 +2072,10 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X86-SSE-NEXT:    divl %ecx
 ; X86-SSE-NEXT:    movd %edx, %xmm4
 ; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-SSE-NEXT:    movd %xmm1, %ecx
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    divl 16(%esi)
+; X86-SSE-NEXT:    divl %ecx
 ; X86-SSE-NEXT:    movd %edx, %xmm3
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; X86-SSE-NEXT:    movd %xmm2, %eax
@@ -2086,9 +2086,10 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X86-SSE-NEXT:    movd %edx, %xmm1
 ; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; X86-SSE-NEXT:    movd %xmm0, %ecx
 ; X86-SSE-NEXT:    movl %edi, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl (%esi)
+; X86-SSE-NEXT:    divl %ecx
 ; X86-SSE-NEXT:    movd %edx, %xmm1
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
 ; X86-SSE-NEXT:    movd %xmm2, %ecx
@@ -2114,7 +2115,8 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; X86-SSE-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl 32(%esi)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    divl 32(%ecx)
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199]
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
@@ -2149,43 +2151,53 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vmovd %xmm2, %eax
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vmovd %xmm1, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl 32(%ecx)
 ; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %eax
+; X86-AVX1-NEXT:    vpextrd $3, %xmm2, %eax
+; X86-AVX1-NEXT:    vmovdqa (%ecx), %xmm1
+; X86-AVX1-NEXT:    vmovdqa 16(%ecx), %xmm3
+; X86-AVX1-NEXT:    vpextrd $3, %xmm3, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 28(%ecx)
+; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %eax
+; X86-AVX1-NEXT:    vpextrd $2, %xmm2, %eax
+; X86-AVX1-NEXT:    vpextrd $2, %xmm3, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 24(%ecx)
+; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm2, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm3, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 20(%ecx)
+; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vmovd %xmm1, %eax
+; X86-AVX1-NEXT:    vmovd %xmm2, %eax
+; X86-AVX1-NEXT:    vmovd %xmm3, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 16(%ecx)
+; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    movl %edx, %ebp
 ; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 12(%ecx)
+; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    movl %edx, %ebx
 ; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %esi
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 8(%ecx)
+; X86-AVX1-NEXT:    divl %esi
 ; X86-AVX1-NEXT:    movl %edx, %esi
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %edi
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 4(%ecx)
+; X86-AVX1-NEXT:    divl %edi
 ; X86-AVX1-NEXT:    movl %edx, %edi
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl (%ecx)
+; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    vmovd %edx, %xmm0
 ; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
@@ -2211,47 +2223,58 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ;
 ; X86-AVX2-LABEL: PR34947:
 ; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    pushl %edi
 ; X86-AVX2-NEXT:    pushl %esi
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; X86-AVX2-NEXT:    vpextrd $1, %xmm2, %eax
+; X86-AVX2-NEXT:    vmovdqa (%esi), %xmm2
+; X86-AVX2-NEXT:    vmovdqa 16(%esi), %xmm3
+; X86-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; X86-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 20(%esi)
+; X86-AVX2-NEXT:    divl %ecx
 ; X86-AVX2-NEXT:    movl %edx, %ecx
-; X86-AVX2-NEXT:    vmovd %xmm2, %eax
+; X86-AVX2-NEXT:    vmovd %xmm3, %edi
+; X86-AVX2-NEXT:    vmovd %xmm4, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 16(%esi)
-; X86-AVX2-NEXT:    vmovd %edx, %xmm3
-; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
-; X86-AVX2-NEXT:    vpextrd $2, %xmm2, %eax
+; X86-AVX2-NEXT:    divl %edi
+; X86-AVX2-NEXT:    vmovd %edx, %xmm5
+; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
+; X86-AVX2-NEXT:    vpextrd $2, %xmm3, %ecx
+; X86-AVX2-NEXT:    vpextrd $2, %xmm4, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 24(%esi)
-; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm3, %xmm3
-; X86-AVX2-NEXT:    vpextrd $3, %xmm2, %eax
+; X86-AVX2-NEXT:    divl %ecx
+; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
+; X86-AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
+; X86-AVX2-NEXT:    vpextrd $3, %xmm4, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 28(%esi)
-; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm3, %xmm2
+; X86-AVX2-NEXT:    divl %ecx
+; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
+; X86-AVX2-NEXT:    vpextrd $1, %xmm2, %ecx
 ; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 4(%esi)
+; X86-AVX2-NEXT:    divl %ecx
 ; X86-AVX2-NEXT:    movl %edx, %ecx
+; X86-AVX2-NEXT:    vmovd %xmm2, %edi
 ; X86-AVX2-NEXT:    vmovd %xmm1, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl (%esi)
-; X86-AVX2-NEXT:    vmovd %edx, %xmm3
-; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
+; X86-AVX2-NEXT:    divl %edi
+; X86-AVX2-NEXT:    vmovd %edx, %xmm4
+; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
+; X86-AVX2-NEXT:    vpextrd $2, %xmm2, %ecx
 ; X86-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 8(%esi)
-; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm3, %xmm3
+; X86-AVX2-NEXT:    divl %ecx
+; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
+; X86-AVX2-NEXT:    vpextrd $3, %xmm2, %ecx
 ; X86-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 12(%esi)
-; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm3, %xmm1
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    divl %ecx
+; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
+; X86-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
 ; X86-AVX2-NEXT:    divl 32(%esi)
@@ -2261,6 +2284,7 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X86-AVX2-NEXT:    movl %eax, (%eax)
 ; X86-AVX2-NEXT:    vmovdqa %ymm0, (%eax)
 ; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    popl %edi
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -2293,9 +2317,10 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X64-SSE-NEXT:    divl %edi
 ; X64-SSE-NEXT:    movd %edx, %xmm4
 ; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X64-SSE-NEXT:    movd %xmm1, %edi
 ; X64-SSE-NEXT:    movl %r9d, %eax
 ; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl 16(%rsi)
+; X64-SSE-NEXT:    divl %edi
 ; X64-SSE-NEXT:    movd %edx, %xmm3
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; X64-SSE-NEXT:    movd %xmm2, %eax
@@ -2306,9 +2331,10 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X64-SSE-NEXT:    movd %edx, %xmm1
 ; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; X64-SSE-NEXT:    movd %xmm0, %edi
 ; X64-SSE-NEXT:    movl %r10d, %eax
 ; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl (%rsi)
+; X64-SSE-NEXT:    divl %edi
 ; X64-SSE-NEXT:    movd %edx, %xmm1
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
 ; X64-SSE-NEXT:    movd %xmm2, %edi
@@ -2359,50 +2385,60 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X64-AVX1-NEXT:    pushq %rbp
 ; X64-AVX1-NEXT:    pushq %rbx
 ; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vmovd %xmm2, %eax
+; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT:    vmovd %xmm1, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl 32(%rsi)
 ; X64-AVX1-NEXT:    movl %edx, %r8d
-; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %eax
+; X64-AVX1-NEXT:    vpextrd $3, %xmm2, %eax
+; X64-AVX1-NEXT:    vmovdqa (%rsi), %xmm1
+; X64-AVX1-NEXT:    vmovdqa 16(%rsi), %xmm3
+; X64-AVX1-NEXT:    vpextrd $3, %xmm3, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl 28(%rsi)
+; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %r9d
-; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %eax
+; X64-AVX1-NEXT:    vpextrd $2, %xmm2, %eax
+; X64-AVX1-NEXT:    vpextrd $2, %xmm3, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl 24(%rsi)
+; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %r10d
-; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %eax
+; X64-AVX1-NEXT:    vpextrd $1, %xmm2, %eax
+; X64-AVX1-NEXT:    vpextrd $1, %xmm3, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl 20(%rsi)
+; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %r11d
-; X64-AVX1-NEXT:    vmovd %xmm1, %eax
+; X64-AVX1-NEXT:    vmovd %xmm2, %eax
+; X64-AVX1-NEXT:    vmovd %xmm3, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl 16(%rsi)
-; X64-AVX1-NEXT:    movl %edx, %ecx
+; X64-AVX1-NEXT:    divl %ecx
+; X64-AVX1-NEXT:    movl %edx, %esi
 ; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
+; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl 12(%rsi)
+; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %edi
 ; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl 8(%rsi)
-; X64-AVX1-NEXT:    movl %edx, %ebx
+; X64-AVX1-NEXT:    divl %ecx
+; X64-AVX1-NEXT:    movl %edx, %ecx
 ; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
+; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %ebx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl 4(%rsi)
-; X64-AVX1-NEXT:    movl %edx, %ebp
+; X64-AVX1-NEXT:    divl %ebx
+; X64-AVX1-NEXT:    movl %edx, %ebx
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vmovd %xmm1, %ebp
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl (%rsi)
+; X64-AVX1-NEXT:    divl %ebp
 ; X64-AVX1-NEXT:    vmovd %edx, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $1, %ebp, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $2, %ebx, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
 ; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vmovd %ecx, %xmm2
+; X64-AVX1-NEXT:    vmovd %esi, %xmm2
 ; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm2, %xmm2
 ; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm2, %xmm2
 ; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm2, %xmm2
@@ -2419,42 +2455,52 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT:    vpextrd $1, %xmm2, %eax
+; X64-AVX2-NEXT:    vmovdqa (%rsi), %xmm2
+; X64-AVX2-NEXT:    vmovdqa 16(%rsi), %xmm3
+; X64-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; X64-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl 20(%rsi)
+; X64-AVX2-NEXT:    divl %ecx
 ; X64-AVX2-NEXT:    movl %edx, %ecx
-; X64-AVX2-NEXT:    vmovd %xmm2, %eax
+; X64-AVX2-NEXT:    vmovd %xmm3, %edi
+; X64-AVX2-NEXT:    vmovd %xmm4, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl 16(%rsi)
-; X64-AVX2-NEXT:    vmovd %edx, %xmm3
-; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
-; X64-AVX2-NEXT:    vpextrd $2, %xmm2, %eax
+; X64-AVX2-NEXT:    divl %edi
+; X64-AVX2-NEXT:    vmovd %edx, %xmm5
+; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
+; X64-AVX2-NEXT:    vpextrd $2, %xmm3, %ecx
+; X64-AVX2-NEXT:    vpextrd $2, %xmm4, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl 24(%rsi)
-; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm3, %xmm3
-; X64-AVX2-NEXT:    vpextrd $3, %xmm2, %eax
+; X64-AVX2-NEXT:    divl %ecx
+; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
+; X64-AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
+; X64-AVX2-NEXT:    vpextrd $3, %xmm4, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl 28(%rsi)
-; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm3, %xmm2
+; X64-AVX2-NEXT:    divl %ecx
+; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
+; X64-AVX2-NEXT:    vpextrd $1, %xmm2, %ecx
 ; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl 4(%rsi)
+; X64-AVX2-NEXT:    divl %ecx
 ; X64-AVX2-NEXT:    movl %edx, %ecx
+; X64-AVX2-NEXT:    vmovd %xmm2, %edi
 ; X64-AVX2-NEXT:    vmovd %xmm1, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl (%rsi)
-; X64-AVX2-NEXT:    vmovd %edx, %xmm3
-; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
+; X64-AVX2-NEXT:    divl %edi
+; X64-AVX2-NEXT:    vmovd %edx, %xmm4
+; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
+; X64-AVX2-NEXT:    vpextrd $2, %xmm2, %ecx
 ; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl 8(%rsi)
-; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm3, %xmm3
+; X64-AVX2-NEXT:    divl %ecx
+; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
+; X64-AVX2-NEXT:    vpextrd $3, %xmm2, %ecx
 ; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl 12(%rsi)
-; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm3, %xmm1
-; X64-AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    divl %ecx
+; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
+; X64-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl 32(%rsi)

diff  --git a/llvm/test/CodeGen/X86/vec_cast.ll b/llvm/test/CodeGen/X86/vec_cast.ll
index e0089354cc953..0a6bc2f59b685 100644
--- a/llvm/test/CodeGen/X86/vec_cast.ll
+++ b/llvm/test/CodeGen/X86/vec_cast.ll
@@ -156,7 +156,7 @@ define <3 x i16> @h(<3 x i32> %a) nounwind {
 ; CHECK-WIN-LABEL: h:
 ; CHECK-WIN:       # %bb.0:
 ; CHECK-WIN-NEXT:    movdqa (%rcx), %xmm0
-; CHECK-WIN-NEXT:    movl (%rcx), %eax
+; CHECK-WIN-NEXT:    movd %xmm0, %eax
 ; CHECK-WIN-NEXT:    pextrw $2, %xmm0, %edx
 ; CHECK-WIN-NEXT:    pextrw $4, %xmm0, %ecx
 ; CHECK-WIN-NEXT:    # kill: def $ax killed $ax killed $eax

diff  --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index e95afd00304f2..12d7b05bf3dac 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -2895,7 +2895,8 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
 ; SSE2-LABEL: sitofp_load_2i64_to_2f64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    cvtsi2sdq (%rdi), %xmm0
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm1, %rax
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
@@ -2905,30 +2906,43 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
 ;
 ; SSE41-LABEL: sitofp_load_2i64_to_2f64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    cvtsi2sdq 8(%rdi), %xmm1
-; SSE41-NEXT:    cvtsi2sdq (%rdi), %xmm0
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2sd %rax, %xmm1
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
 ; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE41-NEXT:    retq
 ;
 ; VEX-LABEL: sitofp_load_2i64_to_2f64:
 ; VEX:       # %bb.0:
-; VEX-NEXT:    vcvtsi2sdq 8(%rdi), %xmm0, %xmm0
-; VEX-NEXT:    vcvtsi2sdq (%rdi), %xmm1, %xmm1
-; VEX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; VEX-NEXT:    vmovdqa (%rdi), %xmm0
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
+; VEX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_load_2i64_to_2f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtsi2sdq 8(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtsi2sdq (%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_load_2i64_to_2f64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtsi2sdq 8(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtsi2sdq (%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64:
@@ -3078,14 +3092,16 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa (%rdi), %xmm1
 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
-; SSE2-NEXT:    cvtsi2sdq (%rdi), %xmm0
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm1, %rax
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
 ; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
 ; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movq %xmm2, %rax
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2sdq 16(%rdi), %xmm1
+; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm2, %rax
 ; SSE2-NEXT:    xorps %xmm2, %xmm2
@@ -3095,46 +3111,72 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ;
 ; SSE41-LABEL: sitofp_load_4i64_to_4f64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    cvtsi2sdq 8(%rdi), %xmm1
-; SSE41-NEXT:    cvtsi2sdq (%rdi), %xmm0
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    cvtsi2sdq 24(%rdi), %xmm2
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2sd %rax, %xmm2
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2sd %rax, %xmm2
+; SSE41-NEXT:    movq %xmm1, %rax
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2sdq 16(%rdi), %xmm1
+; SSE41-NEXT:    cvtsi2sd %rax, %xmm1
 ; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE41-NEXT:    retq
 ;
 ; VEX-LABEL: sitofp_load_4i64_to_4f64:
 ; VEX:       # %bb.0:
-; VEX-NEXT:    vcvtsi2sdq 24(%rdi), %xmm0, %xmm0
-; VEX-NEXT:    vcvtsi2sdq 16(%rdi), %xmm1, %xmm1
-; VEX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; VEX-NEXT:    vcvtsi2sdq 8(%rdi), %xmm2, %xmm1
-; VEX-NEXT:    vcvtsi2sdq (%rdi), %xmm2, %xmm2
-; VEX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; VEX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; VEX-NEXT:    vmovapd (%rdi), %xmm0
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT:    vpextrq $1, %xmm1, %rax
+; VEX-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; VEX-NEXT:    vmovq %xmm1, %rax
+; VEX-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; VEX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; VEX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_load_4i64_to_4f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtsi2sdq 24(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtsi2sdq 16(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vcvtsi2sdq 8(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT:    vcvtsi2sdq (%rdi), %xmm2, %xmm2
-; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vmovapd (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtsi2sdq 24(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtsi2sdq 16(%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    vcvtsi2sdq 8(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT:    vcvtsi2sdq (%rdi), %xmm2, %xmm2
-; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vmovapd (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64:
@@ -3839,14 +3881,16 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa (%rdi), %xmm1
 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
-; SSE2-NEXT:    cvtsi2ssq 16(%rdi), %xmm2
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rax
 ; SSE2-NEXT:    xorps %xmm0, %xmm0
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    movq %xmm1, %rax
 ; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ssq (%rdi), %xmm0
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm1, %rax
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
@@ -3857,47 +3901,72 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ;
 ; SSE41-LABEL: sitofp_load_4i64_to_4f32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    cvtsi2ssq 8(%rdi), %xmm1
-; SSE41-NEXT:    cvtsi2ssq (%rdi), %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ssq 16(%rdi), %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm2, %xmm2
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ssq 24(%rdi), %xmm1
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; SSE41-NEXT:    retq
 ;
 ; VEX-LABEL: sitofp_load_4i64_to_4f32:
 ; VEX:       # %bb.0:
-; VEX-NEXT:    vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
-; VEX-NEXT:    vcvtsi2ssq (%rdi), %xmm1, %xmm1
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; VEX-NEXT:    vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT:    vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
+; VEX-NEXT:    vmovdqa (%rdi), %xmm0
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; VEX-NEXT:    vmovq %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; VEX-NEXT:    vpextrq $1, %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_load_4i64_to_4f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtsi2ssq (%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT:    vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT:    vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtsi2ssq (%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT:    vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT:    vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512VL-NEXT:    retq
 ;
@@ -3991,29 +4060,33 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
 ; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
 ; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
-; SSE2-NEXT:    cvtsi2ssq 16(%rdi), %xmm4
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rax
 ; SSE2-NEXT:    xorps %xmm0, %xmm0
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE2-NEXT:    movq %xmm1, %rax
 ; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ssq (%rdi), %xmm0
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm1, %rax
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-NEXT:    movq %xmm3, %rax
 ; SSE2-NEXT:    xorps %xmm4, %xmm4
-; SSE2-NEXT:    cvtsi2ssq 48(%rdi), %xmm4
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm1, %rax
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT:    movq %xmm2, %rax
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ssq 32(%rdi), %xmm1
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm2, %rax
 ; SSE2-NEXT:    xorps %xmm2, %xmm2
@@ -4024,82 +4097,132 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ;
 ; SSE41-LABEL: sitofp_load_8i64_to_8f32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    cvtsi2ssq 8(%rdi), %xmm1
-; SSE41-NEXT:    cvtsi2ssq (%rdi), %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ssq 16(%rdi), %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE41-NEXT:    movdqa 32(%rdi), %xmm2
+; SSE41-NEXT:    movdqa 48(%rdi), %xmm3
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm4
+; SSE41-NEXT:    movq %xmm0, %rax
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
+; SSE41-NEXT:    movq %xmm1, %rax
+; SSE41-NEXT:    xorps %xmm4, %xmm4
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm4
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ssq 24(%rdi), %xmm1
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT:    cvtsi2ssq 40(%rdi), %xmm2
+; SSE41-NEXT:    pextrq $1, %xmm2, %rax
+; SSE41-NEXT:    xorps %xmm4, %xmm4
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm4
+; SSE41-NEXT:    movq %xmm2, %rax
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ssq 32(%rdi), %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
+; SSE41-NEXT:    movq %xmm3, %rax
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ssq 48(%rdi), %xmm2
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
 ; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; SSE41-NEXT:    pextrq $1, %xmm3, %rax
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ssq 56(%rdi), %xmm2
+; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
 ; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; SSE41-NEXT:    retq
 ;
 ; VEX-LABEL: sitofp_load_8i64_to_8f32:
 ; VEX:       # %bb.0:
-; VEX-NEXT:    vcvtsi2ssq 40(%rdi), %xmm0, %xmm0
-; VEX-NEXT:    vcvtsi2ssq 32(%rdi), %xmm1, %xmm1
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; VEX-NEXT:    vcvtsi2ssq 48(%rdi), %xmm2, %xmm1
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT:    vcvtsi2ssq 56(%rdi), %xmm2, %xmm1
+; VEX-NEXT:    vmovaps (%rdi), %xmm0
+; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
+; VEX-NEXT:    vmovdqa 32(%rdi), %xmm2
+; VEX-NEXT:    vmovdqa 48(%rdi), %xmm3
+; VEX-NEXT:    vpextrq $1, %xmm2, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
+; VEX-NEXT:    vmovq %xmm2, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm2
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; VEX-NEXT:    vmovq %xmm3, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; VEX-NEXT:    vpextrq $1, %xmm3, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; VEX-NEXT:    vpextrq $1, %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; VEX-NEXT:    vmovq %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; VEX-NEXT:    vpextrq $1, %xmm1, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; VEX-NEXT:    vcvtsi2ssq 8(%rdi), %xmm2, %xmm1
-; VEX-NEXT:    vcvtsi2ssq (%rdi), %xmm2, %xmm2
-; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VEX-NEXT:    vcvtsi2ssq 16(%rdi), %xmm3, %xmm2
-; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VEX-NEXT:    vcvtsi2ssq 24(%rdi), %xmm3, %xmm2
-; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; VEX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; VEX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_load_8i64_to_8f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtsi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtsi2ssq 32(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT:    vcvtsi2ssq 48(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT:    vcvtsi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
+; AVX512F-NEXT:    vmovq %xmm2, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512F-NEXT:    vmovq %xmm3, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT:    vcvtsi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT:    vcvtsi2ssq (%rdi), %xmm2, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT:    vcvtsi2ssq 16(%rdi), %xmm3, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT:    vcvtsi2ssq 24(%rdi), %xmm3, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_load_8i64_to_8f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtsi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtsi2ssq 32(%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT:    vcvtsi2ssq 48(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT:    vcvtsi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
+; AVX512VL-NEXT:    vmovq %xmm2, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512VL-NEXT:    vmovq %xmm3, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT:    vcvtsi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT:    vcvtsi2ssq (%rdi), %xmm2, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT:    vcvtsi2ssq 16(%rdi), %xmm3, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT:    vcvtsi2ssq 24(%rdi), %xmm3, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32:
@@ -4229,7 +4352,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; SSE2-LABEL: uitofp_load_4i64_to_4f32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
-; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    movq %xmm0, %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB83_1
 ; SSE2-NEXT:  # %bb.2:
@@ -4243,23 +4366,23 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    addss %xmm1, %xmm1
 ; SSE2-NEXT:  .LBB83_3:
-; SSE2-NEXT:    movq (%rdi), %rax
+; SSE2-NEXT:    movdqa (%rdi), %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rcx
-; SSE2-NEXT:    testq %rcx, %rcx
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB83_4
 ; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
 ; SSE2-NEXT:    jmp .LBB83_6
 ; SSE2-NEXT:  .LBB83_4:
-; SSE2-NEXT:    movq %rcx, %rdx
-; SSE2-NEXT:    shrq %rdx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
-; SSE2-NEXT:    addss %xmm2, %xmm2
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
+; SSE2-NEXT:    addss %xmm3, %xmm3
 ; SSE2-NEXT:  .LBB83_6:
-; SSE2-NEXT:    movdqa (%rdi), %xmm3
+; SSE2-NEXT:    movq %xmm2, %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB83_7
 ; SSE2-NEXT:  # %bb.8:
@@ -4275,8 +4398,8 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:    addss %xmm0, %xmm0
 ; SSE2-NEXT:  .LBB83_9:
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm2, %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB83_10
@@ -4397,23 +4520,35 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ;
 ; AVX512F-LABEL: uitofp_load_4i64_to_4f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtusi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtusi2ssq (%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT:    vcvtusi2ssq 16(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT:    vcvtusi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtusi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtusi2ssq (%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT:    vcvtusi2ssq 16(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT:    vcvtusi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512VL-NEXT:    retq
 ;
@@ -4566,7 +4701,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; SSE2-LABEL: uitofp_load_8i64_to_8f32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
-; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    movq %xmm0, %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_1
 ; SSE2-NEXT:  # %bb.2:
@@ -4580,23 +4715,23 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
 ; SSE2-NEXT:    addss %xmm2, %xmm2
 ; SSE2-NEXT:  .LBB87_3:
-; SSE2-NEXT:    movq (%rdi), %rax
+; SSE2-NEXT:    movdqa (%rdi), %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rcx
-; SSE2-NEXT:    testq %rcx, %rcx
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_4
 ; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm1
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    jmp .LBB87_6
 ; SSE2-NEXT:  .LBB87_4:
-; SSE2-NEXT:    movq %rcx, %rdx
-; SSE2-NEXT:    shrq %rdx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm1
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    addss %xmm1, %xmm1
 ; SSE2-NEXT:  .LBB87_6:
-; SSE2-NEXT:    movdqa (%rdi), %xmm3
+; SSE2-NEXT:    movq %xmm3, %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_7
 ; SSE2-NEXT:  # %bb.8:
@@ -4612,23 +4747,23 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:    addss %xmm0, %xmm0
 ; SSE2-NEXT:  .LBB87_9:
-; SSE2-NEXT:    movq 48(%rdi), %rax
+; SSE2-NEXT:    movdqa 48(%rdi), %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; SSE2-NEXT:    movq %xmm3, %rcx
-; SSE2-NEXT:    testq %rcx, %rcx
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_10
 ; SSE2-NEXT:  # %bb.11:
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm4
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
 ; SSE2-NEXT:    jmp .LBB87_12
 ; SSE2-NEXT:  .LBB87_10:
-; SSE2-NEXT:    movq %rcx, %rdx
-; SSE2-NEXT:    shrq %rdx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm4
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
 ; SSE2-NEXT:    addss %xmm4, %xmm4
 ; SSE2-NEXT:  .LBB87_12:
-; SSE2-NEXT:    movdqa 48(%rdi), %xmm5
+; SSE2-NEXT:    movq %xmm6, %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_13
 ; SSE2-NEXT:  # %bb.14:
@@ -4644,27 +4779,27 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
 ; SSE2-NEXT:    addss %xmm3, %xmm3
 ; SSE2-NEXT:  .LBB87_15:
-; SSE2-NEXT:    movq 32(%rdi), %rax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; SSE2-NEXT:    movq %xmm5, %rcx
-; SSE2-NEXT:    testq %rcx, %rcx
+; SSE2-NEXT:    movdqa 32(%rdi), %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; SSE2-NEXT:    movq %xmm6, %rax
+; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_16
 ; SSE2-NEXT:  # %bb.17:
-; SSE2-NEXT:    xorps %xmm5, %xmm5
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm5
+; SSE2-NEXT:    xorps %xmm6, %xmm6
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm6
 ; SSE2-NEXT:    jmp .LBB87_18
 ; SSE2-NEXT:  .LBB87_16:
-; SSE2-NEXT:    movq %rcx, %rdx
-; SSE2-NEXT:    shrq %rdx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    xorps %xmm5, %xmm5
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm5
-; SSE2-NEXT:    addss %xmm5, %xmm5
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    xorps %xmm6, %xmm6
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm6
+; SSE2-NEXT:    addss %xmm6, %xmm6
 ; SSE2-NEXT:  .LBB87_18:
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT:    movdqa 32(%rdi), %xmm4
+; SSE2-NEXT:    movq %xmm5, %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_19
 ; SSE2-NEXT:  # %bb.20:
@@ -4681,8 +4816,8 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ; SSE2-NEXT:    addss %xmm1, %xmm1
 ; SSE2-NEXT:  .LBB87_21:
 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm2, %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_22
@@ -4886,40 +5021,64 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
 ;
 ; AVX512F-LABEL: uitofp_load_8i64_to_8f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtusi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtusi2ssq 32(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT:    vcvtusi2ssq 48(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT:    vcvtusi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm4
+; AVX512F-NEXT:    vmovq %xmm2, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm2
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512F-NEXT:    vmovq %xmm3, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm0
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT:    vcvtusi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT:    vcvtusi2ssq (%rdi), %xmm2, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT:    vcvtusi2ssq 16(%rdi), %xmm3, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT:    vcvtusi2ssq 24(%rdi), %xmm3, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_8i64_to_8f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtusi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtusi2ssq 32(%rdi), %xmm1, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT:    vcvtusi2ssq 48(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT:    vcvtusi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm4
+; AVX512VL-NEXT:    vmovq %xmm2, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm2
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX512VL-NEXT:    vmovq %xmm3, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm0
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT:    vcvtusi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT:    vcvtusi2ssq (%rdi), %xmm2, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT:    vcvtusi2ssq 16(%rdi), %xmm3, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT:    vcvtusi2ssq 24(%rdi), %xmm3, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32:


        


More information about the llvm-commits mailing list