[llvm] r323987 - [X86][SSE] LowerBUILD_VECTORAsVariablePermute - add support for scaling index vectors

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 1 10:10:30 PST 2018


Author: rksimon
Date: Thu Feb  1 10:10:30 2018
New Revision: 323987

URL: http://llvm.org/viewvc/llvm-project?rev=323987&view=rev
Log:
[X86][SSE] LowerBUILD_VECTORAsVariablePermute - add support for scaling index vectors

This allows us to use PSHUFB for v8i16/v4i32 and VPERMD/PERMPS for v4i64/v4f64 variable shuffles.

Differential Revision: https://reviews.llvm.org/D42487

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/var-permute-128.ll
    llvm/trunk/test/CodeGen/X86/var-permute-256.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=323987&r1=323986&r2=323987&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Feb  1 10:10:30 2018
@@ -7818,8 +7818,6 @@ static SDValue materializeVectorConstant
 // TODO: Handle undefs
 // TODO: Utilize pshufb and zero mask blending to support more efficient
 // construction of vectors with constant-0 elements.
-// TODO: Use smaller-element vectors of same width, and "interpolate" the
-// indices, when no native operation available.
 static SDValue
 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
@@ -7833,11 +7831,22 @@ LowerBUILD_VECTORAsVariablePermute(SDVal
       if (Subtarget.hasSSE3())
         Opcode = X86ISD::PSHUFB;
       break;
+    case MVT::v8i16:
+      if (Subtarget.hasVLX() && Subtarget.hasBWI())
+        Opcode = X86ISD::VPERMV;
+      else if (Subtarget.hasSSE3()) {
+        Opcode = X86ISD::PSHUFB;
+        ShuffleVT = MVT::v16i8;
+      }
+      break;
     case MVT::v4f32:
     case MVT::v4i32:
       if (Subtarget.hasAVX()) {
         Opcode = X86ISD::VPERMILPV;
         ShuffleVT = MVT::v4f32;
+      } else if (Subtarget.hasSSE3()) {
+        Opcode = X86ISD::PSHUFB;
+        ShuffleVT = MVT::v16i8;
       }
       break;
     case MVT::v2f64:
@@ -7856,6 +7865,10 @@ LowerBUILD_VECTORAsVariablePermute(SDVal
     case MVT::v4f64:
       if (Subtarget.hasVLX())
         Opcode = X86ISD::VPERMV;
+      else if (Subtarget.hasAVX2()) {
+        Opcode = X86ISD::VPERMV;
+        ShuffleVT = MVT::v8f32;
+      }
       break;
     case MVT::v16f32:
     case MVT::v8f64:
@@ -7868,7 +7881,6 @@ LowerBUILD_VECTORAsVariablePermute(SDVal
       if (Subtarget.hasBWI())
         Opcode = X86ISD::VPERMV;
       break;
-    case MVT::v8i16:
     case MVT::v16i16:
       if (Subtarget.hasVLX() && Subtarget.hasBWI())
         Opcode = X86ISD::VPERMV;
@@ -7927,8 +7939,8 @@ LowerBUILD_VECTORAsVariablePermute(SDVal
   unsigned Opcode = LegalPermuteOpcode(VT, ShuffleVT);
   if (!Opcode)
     return SDValue();
-  assert(VT.getScalarSizeInBits() == ShuffleVT.getScalarSizeInBits() &&
-         VT.getVectorNumElements() == ShuffleVT.getVectorNumElements() &&
+  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
+         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
          "Illegal variable permute shuffle type");
 
   unsigned NumElts = VT.getVectorNumElements();
@@ -7950,6 +7962,33 @@ LowerBUILD_VECTORAsVariablePermute(SDVal
                     SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
   }
 
+  uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
+  if (Scale > 1) {
+    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
+    unsigned ShuffleBits = ShuffleVT.getScalarSizeInBits();
+    uint64_t IndexScale = 0;
+    uint64_t IndexOffset = 0;
+
+    // If we're scaling a smaller permute op, then we need to repeat the indices,
+    // scaling and offsetting them as well.
+    // e.g. v4i32 -> v16i8 (Scale = 4)
+    // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
+    // indexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
+    for (uint64_t i = 0; i != Scale; ++i) {
+      IndexScale |= Scale << (i * ShuffleBits);
+      IndexOffset |= i << (i * ShuffleBits);
+    }
+
+    SDLoc DL(IndicesVec);
+    IndicesVec = DAG.getNode(ISD::MUL, DL, IndicesVT, IndicesVec,
+                             DAG.getConstant(IndexScale, DL, IndicesVT));
+    IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec,
+                             DAG.getConstant(IndexOffset, DL, IndicesVT));
+  }
+
+  EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
+  IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
+
   SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
   SDValue Res =
       Opcode == X86ISD::VPERMV

Modified: llvm/trunk/test/CodeGen/X86/var-permute-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/var-permute-128.ll?rev=323987&r1=323986&r2=323987&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/var-permute-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/var-permute-128.ll Thu Feb  1 10:10:30 2018
@@ -37,25 +37,15 @@ define <2 x i64> @var_shuffle_v2i64(<2 x
 define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
 ; SSSE3-LABEL: var_shuffle_v4i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movd %xmm1, %eax
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
-; SSSE3-NEXT:    movd %xmm2, %ecx
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSSE3-NEXT:    movd %xmm2, %edx
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSSE3-NEXT:    movd %xmm1, %esi
-; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    andl $3, %eax
-; SSSE3-NEXT:    andl $3, %ecx
-; SSSE3-NEXT:    andl $3, %edx
-; SSSE3-NEXT:    andl $3, %esi
-; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: var_shuffle_v4i32:
@@ -80,76 +70,16 @@ define <4 x i32> @var_shuffle_v4i32(<4 x
 define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
 ; SSSE3-LABEL: var_shuffle_v8i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movd %xmm1, %r8d
-; SSSE3-NEXT:    pextrw $1, %xmm1, %r9d
-; SSSE3-NEXT:    pextrw $2, %xmm1, %r10d
-; SSSE3-NEXT:    pextrw $3, %xmm1, %esi
-; SSSE3-NEXT:    pextrw $4, %xmm1, %edi
-; SSSE3-NEXT:    pextrw $5, %xmm1, %eax
-; SSSE3-NEXT:    pextrw $6, %xmm1, %ecx
-; SSSE3-NEXT:    pextrw $7, %xmm1, %edx
-; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    andl $7, %r8d
-; SSSE3-NEXT:    andl $7, %r9d
-; SSSE3-NEXT:    andl $7, %r10d
-; SSSE3-NEXT:    andl $7, %esi
-; SSSE3-NEXT:    andl $7, %edi
-; SSSE3-NEXT:    andl $7, %eax
-; SSSE3-NEXT:    andl $7, %ecx
-; SSSE3-NEXT:    andl $7, %edx
-; SSSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
-; SSSE3-NEXT:    movd %edx, %xmm0
-; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm1
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    movzwl -24(%rsp,%r10,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm1
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm3
-; SSSE3-NEXT:    movzwl -24(%rsp,%r8,2), %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    paddw {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVXNOVLBW-LABEL: var_shuffle_v8i16:
 ; AVXNOVLBW:       # %bb.0:
-; AVXNOVLBW-NEXT:    vmovd %xmm1, %eax
-; AVXNOVLBW-NEXT:    vpextrw $1, %xmm1, %r10d
-; AVXNOVLBW-NEXT:    vpextrw $2, %xmm1, %ecx
-; AVXNOVLBW-NEXT:    vpextrw $3, %xmm1, %edx
-; AVXNOVLBW-NEXT:    vpextrw $4, %xmm1, %esi
-; AVXNOVLBW-NEXT:    vpextrw $5, %xmm1, %edi
-; AVXNOVLBW-NEXT:    vpextrw $6, %xmm1, %r8d
-; AVXNOVLBW-NEXT:    vpextrw $7, %xmm1, %r9d
-; AVXNOVLBW-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVXNOVLBW-NEXT:    andl $7, %eax
-; AVXNOVLBW-NEXT:    andl $7, %r10d
-; AVXNOVLBW-NEXT:    andl $7, %ecx
-; AVXNOVLBW-NEXT:    andl $7, %edx
-; AVXNOVLBW-NEXT:    andl $7, %esi
-; AVXNOVLBW-NEXT:    andl $7, %edi
-; AVXNOVLBW-NEXT:    andl $7, %r8d
-; AVXNOVLBW-NEXT:    andl $7, %r9d
-; AVXNOVLBW-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVXNOVLBW-NEXT:    vmovd %eax, %xmm0
-; AVXNOVLBW-NEXT:    vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT:    vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT:    vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT:    vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT:    vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT:    vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT:    vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVXNOVLBW-NEXT:    vpaddw {{.*}}(%rip), %xmm1, %xmm1
+; AVXNOVLBW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVXNOVLBW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: var_shuffle_v8i16:
@@ -273,25 +203,15 @@ define <2 x double> @var_shuffle_v2f64(<
 define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
 ; SSSE3-LABEL: var_shuffle_v4f32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movd %xmm1, %eax
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
-; SSSE3-NEXT:    movd %xmm2, %ecx
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSSE3-NEXT:    movd %xmm2, %edx
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSSE3-NEXT:    movd %xmm1, %esi
-; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    andl $3, %eax
-; SSSE3-NEXT:    andl $3, %ecx
-; SSSE3-NEXT:    andl $3, %edx
-; SSSE3-NEXT:    andl $3, %esi
-; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: var_shuffle_v4f32:

Modified: llvm/trunk/test/CodeGen/X86/var-permute-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/var-permute-256.ll?rev=323987&r1=323986&r2=323987&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/var-permute-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/var-permute-256.ll Thu Feb  1 10:10:30 2018
@@ -38,83 +38,44 @@ define <4 x i64> @var_shuffle_v4i64(<4 x
 ;
 ; AVX2-LABEL: var_shuffle_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    movq %rsp, %rbp
-; AVX2-NEXT:    andq $-32, %rsp
-; AVX2-NEXT:    subq $64, %rsp
-; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    andl $3, %eax
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vmovq %xmm1, %rdx
-; AVX2-NEXT:    andl $3, %edx
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX2-NEXT:    andl $3, %esi
-; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    movq %rbp, %rsp
-; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: var_shuffle_v4i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $64, %rsp
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    andl $3, %eax
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX512F-NEXT:    andl $3, %ecx
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm1, %rdx
-; AVX512F-NEXT:    andl $3, %edx
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX512F-NEXT:    andl $3, %esi
-; AVX512F-NEXT:    vmovaps %ymm0, (%rsp)
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX512F-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512F-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: var_shuffle_v4i64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    movq %rsp, %rbp
-; AVX512DQ-NEXT:    andq $-32, %rsp
-; AVX512DQ-NEXT:    subq $64, %rsp
-; AVX512DQ-NEXT:    vmovq %xmm1, %rax
-; AVX512DQ-NEXT:    andl $3, %eax
-; AVX512DQ-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX512DQ-NEXT:    andl $3, %ecx
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-NEXT:    vmovq %xmm1, %rdx
-; AVX512DQ-NEXT:    andl $3, %edx
-; AVX512DQ-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX512DQ-NEXT:    andl $3, %esi
-; AVX512DQ-NEXT:    vmovaps %ymm0, (%rsp)
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    movq %rbp, %rsp
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512DQ-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_shuffle_v4i64:
@@ -1348,77 +1309,44 @@ define <4 x double> @var_shuffle_v4f64(<
 ;
 ; AVX2-LABEL: var_shuffle_v4f64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    movq %rsp, %rbp
-; AVX2-NEXT:    andq $-32, %rsp
-; AVX2-NEXT:    subq $64, %rsp
-; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    andl $3, %eax
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vmovq %xmm1, %rdx
-; AVX2-NEXT:    andl $3, %edx
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX2-NEXT:    andl $3, %esi
-; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    movq %rbp, %rsp
-; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: var_shuffle_v4f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $64, %rsp
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    andl $3, %eax
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX512F-NEXT:    andl $3, %ecx
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm1, %rdx
-; AVX512F-NEXT:    andl $3, %edx
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX512F-NEXT:    andl $3, %esi
-; AVX512F-NEXT:    vmovaps %ymm0, (%rsp)
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX512F-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512F-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: var_shuffle_v4f64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    movq %rsp, %rbp
-; AVX512DQ-NEXT:    andq $-32, %rsp
-; AVX512DQ-NEXT:    subq $64, %rsp
-; AVX512DQ-NEXT:    vmovq %xmm1, %rax
-; AVX512DQ-NEXT:    andl $3, %eax
-; AVX512DQ-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX512DQ-NEXT:    andl $3, %ecx
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-NEXT:    vmovq %xmm1, %rdx
-; AVX512DQ-NEXT:    andl $3, %edx
-; AVX512DQ-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX512DQ-NEXT:    andl $3, %esi
-; AVX512DQ-NEXT:    vmovaps %ymm0, (%rsp)
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    movq %rbp, %rsp
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512DQ-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_shuffle_v4f64:
@@ -1542,65 +1470,47 @@ define <4 x i64> @var_shuffle_v4i64_from
 ;
 ; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vmovq %xmm1, %rdx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: var_shuffle_v4i64_from_v2i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    andl $1, %eax
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX512F-NEXT:    andl $1, %ecx
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm1, %rdx
-; AVX512F-NEXT:    andl $1, %edx
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX512F-NEXT:    andl $1, %esi
-; AVX512F-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX512F-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512F-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: var_shuffle_v4i64_from_v2i64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovq %xmm1, %rax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-NEXT:    vmovq %xmm1, %rdx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX512DQ-NEXT:    andl $1, %esi
-; AVX512DQ-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512DQ-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_shuffle_v4i64_from_v2i64:
@@ -2768,59 +2678,47 @@ define <4 x double> @var_shuffle_v4f64_f
 ;
 ; AVX2-LABEL: var_shuffle_v4f64_from_v2f64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vmovq %xmm1, %rdx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: var_shuffle_v4f64_from_v2f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    andl $1, %eax
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX512F-NEXT:    andl $1, %ecx
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm1, %rdx
-; AVX512F-NEXT:    andl $1, %edx
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX512F-NEXT:    andl $1, %esi
-; AVX512F-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX512F-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsrlq $32, %ymm1, %ymm3
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512F-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: var_shuffle_v4f64_from_v2f64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovq %xmm1, %rax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    vpextrq $1, %xmm1, %rcx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-NEXT:    vmovq %xmm1, %rdx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX512DQ-NEXT:    andl $1, %esi
-; AVX512DQ-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512DQ-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_shuffle_v4f64_from_v2f64:




More information about the llvm-commits mailing list