[llvm] 95ab426 - [X86] Attempt to canonicalize vXf64 SHUFPD shuffle masks with undef elts to improve further folding (#116419)

via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 19 02:45:11 PST 2024


Author: Simon Pilgrim
Date: 2024-11-19T10:45:07Z
New Revision: 95ab42661e8d1f57a4ef8e9d058b44627af0e58d

URL: https://github.com/llvm/llvm-project/commit/95ab42661e8d1f57a4ef8e9d058b44627af0e58d
DIFF: https://github.com/llvm/llvm-project/commit/95ab42661e8d1f57a4ef8e9d058b44627af0e58d.diff

LOG: [X86] Attempt to canonicalize vXf64 SHUFPD shuffle masks with undef elts to improve further folding (#116419)

Currently when creating a SHUFPD immediate mask, any undef shuffle elements are set to 0, which can limit options for further shuffle combining.

This patch attempts to canonicalize the mask to improve folding: first by detecting a per-lane broadcast style mask (which can allow us to fold to UNPCK instead), and second ensure any undef elements are set to an 'inplace' value to improve chances of the SHUFPD later folding to a BLENDPD (or be bypassed in a SimplifyMultipleUseDemandedVectorElts call).

This is very similar to canonicalization we already attempt in getV4X86ShuffleImm for vXi32/vXf32 SHUFPS/SHUFD shuffles.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
    llvm/test/CodeGen/X86/subvector-broadcast.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
    llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
    llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 009a94daf5854f..dfaa9dc7fe94df 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9926,6 +9926,42 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
   return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
 }
 
+// Canonicalize SHUFPD mask to improve chances of further folding.
+// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
+static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
+  assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
+         "Unexpected SHUFPD mask size");
+  assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
+         "Unexpected SHUFPD mask elements");
+
+  // If the mask only uses one non-undef element, then fully 'splat' it to
+  // improve later broadcast matching.
+  int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
+  assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
+
+  int FirstElt = Mask[FirstIndex];
+  if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
+      count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
+    unsigned Imm = 0;
+    for (unsigned I = 0, E = Mask.size(); I != E; ++I)
+      Imm |= FirstElt << I;
+    return Imm;
+  }
+
+  // Attempt to keep any undef elements in place to improve chances of the
+  // shuffle becoming a (commutative) blend.
+  unsigned Imm = 0;
+  for (unsigned I = 0, E = Mask.size(); I != E; ++I)
+    Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
+
+  return Imm;
+}
+
+static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
+                                   SelectionDAG &DAG) {
+  return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
+}
+
 // The Shuffle result is as follow:
 // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
 // Each Zeroable's element correspond to a particular Mask's element.
@@ -14871,7 +14907,7 @@ static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
 
   int LHSMask[4] = {-1, -1, -1, -1};
   int RHSMask[4] = {-1, -1, -1, -1};
-  unsigned SHUFPMask = 0;
+  int SHUFPDMask[4] = {-1, -1, -1, -1};
 
   // As SHUFPD uses a single LHS/RHS element per lane, we can always
   // perform the shuffle once the lanes have been shuffled in place.
@@ -14882,13 +14918,13 @@ static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
     int LaneBase = i & ~1;
     auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
     LaneMask[LaneBase + (M & 1)] = M;
-    SHUFPMask |= (M & 1) << i;
+    SHUFPDMask[i] = M & 1;
   }
 
   SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
   SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
   return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
-                     DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
+                     getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
 }
 
 /// Lower a vector shuffle crossing multiple 128-bit lanes as
@@ -15800,9 +15836,9 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
 
   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
   // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
-  ShuffleImm = 0;
-  bool ShufpdMask = true;
-  bool CommutableMask = true;
+  bool IsSHUFPD = true;
+  bool IsCommutable = true;
+  SmallVector<int, 8> SHUFPDMask(NumElts, -1);
   for (int i = 0; i < NumElts; ++i) {
     if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
       continue;
@@ -15811,20 +15847,21 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
     int Val = (i & 6) + NumElts * (i & 1);
     int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
     if (Mask[i] < Val || Mask[i] > Val + 1)
-      ShufpdMask = false;
+      IsSHUFPD = false;
     if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
-      CommutableMask = false;
-    ShuffleImm |= (Mask[i] % 2) << i;
+      IsCommutable = false;
+    SHUFPDMask[i] = Mask[i] % 2;
   }
 
-  if (!ShufpdMask && !CommutableMask)
+  if (!IsSHUFPD && !IsCommutable)
     return false;
 
-  if (!ShufpdMask && CommutableMask)
+  if (!IsSHUFPD && IsCommutable)
     std::swap(V1, V2);
 
   ForceV1Zero = ZeroLane[0];
   ForceV2Zero = ZeroLane[1];
+  ShuffleImm = getSHUFPDImm(SHUFPDMask);
   return true;
 }
 

diff  --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 4972d3e4ec72bb..cad1d09f11d9c3 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -4756,7 +4756,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
 ; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2

diff  --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
index 056c6404f5cfa0..76183ac5f8fa3e 100644
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -1667,13 +1667,13 @@ define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(ptr %vp, <8 x float> %default
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vbroadcastsd (%eax), %ymm1
-; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X86-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
-; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X64-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
 ; X64-NEXT:    retq
   %vec = load <2 x float>, ptr %vp
   %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index e14a12d80f28d5..b7e46e51064c04 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -1002,7 +1002,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm3[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[0],ymm10[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[1],ymm10[3],ymm4[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4]
@@ -1038,7 +1038,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovaps 16(%rdi), %xmm11
 ; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm13 = ymm12[1],ymm1[0],ymm12[2],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm13 = ymm12[1],ymm1[0],ymm12[2],ymm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[0,1],ymm13[2,0],ymm0[4,5],ymm13[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
@@ -1973,7 +1973,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm7[0,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm1[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm8[0],ymm13[0],ymm8[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm8[0],ymm13[1],ymm8[3],ymm13[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm11[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1990,7 +1990,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 320(%rdi), %ymm12
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm4[0],ymm12[0],ymm4[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm4[0],ymm12[1],ymm4[3],ymm12[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7]
 ; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2066,7 +2066,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 16(%rdi), %xmm5
 ; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm6
-; AVX-NEXT:    vshufpd {{.*#+}} ymm8 = ymm6[1],ymm1[0],ymm6[2],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm8 = ymm6[1],ymm1[0],ymm6[2],ymm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,0],ymm7[4,5],ymm8[6,4]
 ; AVX-NEXT:    vmovaps %ymm7, %ymm13
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3,4,5,6,7]
@@ -2080,7 +2080,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 208(%rdi), %xmm0
 ; AVX-NEXT:    vblendps {{.*#+}} xmm14 = xmm0[0],xmm14[1],xmm0[2,3]
 ; AVX-NEXT:    vmovapd 272(%rdi), %xmm1
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm1[1],ymm15[0],ymm1[2],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm1[1],ymm15[0],ymm1[2],ymm15[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm7[2,0],ymm4[4,5],ymm7[6,4]
 ; AVX-NEXT:    vmovaps %ymm4, %ymm10
 ; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7]
@@ -4025,7 +4025,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm2[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[3],ymm3[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4045,7 +4045,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4071,7 +4071,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[3],ymm4[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4096,7 +4096,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 704(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm10[0,1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm10[0],ymm0[0],ymm10[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm10[0],ymm0[1],ymm10[3],ymm0[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
 ; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4265,7 +4265,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 464(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4283,7 +4283,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 16(%rdi), %xmm15
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3]
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm10
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm10[1],ymm13[0],ymm10[2],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm10[1],ymm13[0],ymm10[2],ymm13[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,0],ymm11[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1]
@@ -4301,7 +4301,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
 ; AVX-NEXT:    vmovapd 272(%rdi), %xmm2
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1]
@@ -4317,7 +4317,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 592(%rdi), %xmm8
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3]
 ; AVX-NEXT:    vmovapd 656(%rdi), %xmm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm9[1],ymm12[0],ymm9[2],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm9[1],ymm12[0],ymm9[2],ymm12[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm4[0,1],ymm14[2,0],ymm4[4,5],ymm14[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3,4,5,6,7]
@@ -8052,7 +8052,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8075,7 +8075,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8103,7 +8103,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8130,7 +8130,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8157,7 +8157,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8184,7 +8184,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8211,7 +8211,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8239,7 +8239,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm12[0,1]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
 ; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8604,7 +8604,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8625,7 +8625,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 272(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
@@ -8646,7 +8646,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 464(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1]
@@ -8665,8 +8665,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 656(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2]
+; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
@@ -8686,7 +8686,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 848(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1]
@@ -8705,8 +8705,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 1040(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2]
+; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
@@ -8726,8 +8726,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; AVX-NEXT:    vmovapd 1232(%rdi), %xmm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2]
+; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
@@ -8746,8 +8746,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3]
 ; AVX-NEXT:    vmovapd 1424(%rdi), %xmm9
 ; AVX-NEXT:    vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT:    # ymm15 = ymm9[1],mem[0],ymm9[2],mem[2]
+; AVX-NEXT:    vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT:    # ymm15 = ymm9[1],mem[0],ymm9[2],mem[3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm9[0,1],ymm15[2,0],ymm9[4,5],ymm15[6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7]

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
index 94e91f546a9a3e..6f534ee9cdf0bc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
@@ -227,7 +227,7 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = mem[0,1],ymm0[2,3]
 ; AVX-NEXT:    vinsertf128 $1, 64(%rdi), %ymm1, %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2],ymm1[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[3]
 ; AVX-NEXT:    vbroadcastsd 80(%rdi), %ymm4
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3]
@@ -498,10 +498,10 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = mem[0,1],ymm0[2,3]
 ; AVX-NEXT:    vinsertf128 $1, 160(%rdi), %ymm5, %ymm5
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2],ymm5[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm1[0],ymm3[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm1[0],ymm3[3],ymm1[3]
 ; AVX-NEXT:    vbroadcastsd 80(%rdi), %ymm8
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm0[0],ymm6[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm0[0],ymm6[3],ymm0[3]
 ; AVX-NEXT:    vbroadcastsd 176(%rdi), %ymm8
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3]
@@ -942,16 +942,16 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = mem[0,1],ymm2[2,3]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = mem[0,1],ymm1[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm7[1],ymm5[0],ymm7[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm7[1],ymm5[0],ymm7[3],ymm5[3]
 ; AVX-NEXT:    vbroadcastsd 176(%rdi), %ymm6
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm8[1],ymm2[0],ymm8[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm8[1],ymm2[0],ymm8[3],ymm2[3]
 ; AVX-NEXT:    vbroadcastsd 368(%rdi), %ymm10
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm10 = ymm9[1],ymm1[0],ymm9[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm10 = ymm9[1],ymm1[0],ymm9[3],ymm1[3]
 ; AVX-NEXT:    vbroadcastsd 80(%rdi), %ymm11
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm3[1],ymm0[0],ymm3[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm3[1],ymm0[0],ymm3[3],ymm0[3]
 ; AVX-NEXT:    vbroadcastsd 272(%rdi), %ymm12
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3]
 ; AVX-NEXT:    vmovaps 112(%rdi), %xmm12
@@ -1761,41 +1761,41 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = mem[0,1],ymm13[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm15 = mem[0,1],ymm7[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm12[1],ymm10[0],ymm12[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm12[1],ymm10[0],ymm12[3],ymm10[3]
 ; AVX-NEXT:    vbroadcastsd 176(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm8[0],ymm11[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm8[0],ymm11[3],ymm8[3]
 ; AVX-NEXT:    vbroadcastsd 368(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm3[1],ymm9[0],ymm3[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm3[1],ymm9[0],ymm3[3],ymm9[3]
 ; AVX-NEXT:    vbroadcastsd 560(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[1],ymm13[0],ymm2[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[1],ymm13[0],ymm2[3],ymm13[3]
 ; AVX-NEXT:    vbroadcastsd 752(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm15[1],ymm7[0],ymm15[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm15[1],ymm7[0],ymm15[3],ymm7[3]
 ; AVX-NEXT:    vbroadcastsd 80(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 224(%rdi), %ymm3
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = mem[0,1],ymm3[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm4[1],ymm3[0],ymm4[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm4[1],ymm3[0],ymm4[3],ymm3[3]
 ; AVX-NEXT:    vbroadcastsd 272(%rdi), %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = mem[0,1],ymm1[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm6[1],ymm1[0],ymm6[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm6[1],ymm1[0],ymm6[3],ymm1[3]
 ; AVX-NEXT:    vbroadcastsd 464(%rdi), %ymm5
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm5[3]
 ; AVX-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = mem[0,1],ymm2[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[1],ymm2[0],ymm5[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[1],ymm2[0],ymm5[3],ymm2[3]
 ; AVX-NEXT:    vbroadcastsd 656(%rdi), %ymm14
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3418,44 +3418,44 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd %ymm0, %ymm2
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[1],ymm6[0],ymm7[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[1],ymm6[0],ymm7[3],ymm6[3]
 ; AVX-NEXT:    vbroadcastsd 176(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm8[1],ymm5[0],ymm8[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm8[1],ymm5[0],ymm8[3],ymm5[3]
 ; AVX-NEXT:    vbroadcastsd 368(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm9[1],ymm4[0],ymm9[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm9[1],ymm4[0],ymm9[3],ymm4[3]
 ; AVX-NEXT:    vbroadcastsd 560(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[1],ymm3[0],ymm10[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[1],ymm3[0],ymm10[3],ymm3[3]
 ; AVX-NEXT:    vbroadcastsd 752(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm2[0],ymm11[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm2[0],ymm11[3],ymm2[3]
 ; AVX-NEXT:    vbroadcastsd 944(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm9
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm9[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[3]
 ; AVX-NEXT:    vbroadcastsd 1136(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1280(%rdi), %ymm8
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm8[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm8[0],ymm0[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm8[0],ymm0[3],ymm8[3]
 ; AVX-NEXT:    vbroadcastsd 1328(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1472(%rdi), %ymm7
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[3]
 ; AVX-NEXT:    vbroadcastsd 1520(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3463,52 +3463,52 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = mem[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3]
 ; AVX-NEXT:    vbroadcastsd 80(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 224(%rdi), %ymm13
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm13[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm13[0],ymm0[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm13[0],ymm0[3],ymm13[3]
 ; AVX-NEXT:    vbroadcastsd 272(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm12
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm12[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm12[0],ymm0[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm12[0],ymm0[3],ymm12[3]
 ; AVX-NEXT:    vbroadcastsd 464(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm10
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1],ymm10[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[3]
 ; AVX-NEXT:    vbroadcastsd 656(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 800(%rdi), %ymm5
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm14 = mem[0,1],ymm5[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm14[1],ymm5[0],ymm14[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm14[1],ymm5[0],ymm14[3],ymm5[3]
 ; AVX-NEXT:    vbroadcastsd 848(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 992(%rdi), %ymm4
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = mem[0,1],ymm4[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm4[0],ymm11[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm4[0],ymm11[3],ymm4[3]
 ; AVX-NEXT:    vbroadcastsd 1040(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1184(%rdi), %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = mem[0,1],ymm2[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[1],ymm2[0],ymm6[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[1],ymm2[0],ymm6[3],ymm2[3]
 ; AVX-NEXT:    vbroadcastsd 1232(%rdi), %ymm3
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1376(%rdi), %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = mem[0,1],ymm1[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm3[1],ymm1[0],ymm3[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm3[1],ymm1[0],ymm3[3],ymm1[3]
 ; AVX-NEXT:    vbroadcastsd 1424(%rdi), %ymm15
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
index 0648d1b4abdf61..07988a416bac4f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
@@ -328,7 +328,7 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm8
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[3],ymm9[2]
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovaps 96(%rdi), %xmm4
@@ -337,7 +337,7 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm8
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
@@ -818,12 +818,12 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3]
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm13
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[3],ymm14[2]
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovaps 288(%rdi), %xmm14
 ; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm10
-; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[3],ymm10[2]
 ; AVX-NEXT:    vmovdqa 208(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3]
@@ -839,13 +839,13 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3],xmm15[4,5,6,7]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[0],ymm12[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[1],ymm12[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3]
 ; AVX-NEXT:    vmovapd 288(%rdi), %ymm12
 ; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[3],ymm12[2]
 ; AVX-NEXT:    vmovdqa 224(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3]
@@ -1789,28 +1789,28 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 288(%rdi), %xmm13
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm15
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[3],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[3],ymm15[2]
 ; AVX-NEXT:    vmovdqa 208(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 608(%rdi), %xmm3
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 528(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm6
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[3],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[3],ymm11[2]
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm11
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, (%rsp) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 448(%rdi), %xmm2
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm14[0],ymm5[0],ymm14[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm14[0],ymm5[1],ymm14[3],ymm5[2]
 ; AVX-NEXT:    vmovdqa 368(%rdi), %xmm14
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3]
@@ -1842,25 +1842,25 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm2[4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovapd 288(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[3],ymm14[2]
 ; AVX-NEXT:    vmovdqa 224(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm9[0,1],ymm0[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[3],ymm5[2]
 ; AVX-NEXT:    vmovdqa 544(%rdi), %xmm2
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm7[0,1],ymm0[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX-NEXT:    vmovapd 128(%rdi), %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm0[2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
 ; AVX-NEXT:    vmovapd 448(%rdi), %ymm10
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[3],ymm10[2]
 ; AVX-NEXT:    vmovdqa 384(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3]
@@ -3694,14 +3694,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 288(%rdi), %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 208(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 608(%rdi), %xmm9
 ; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 528(%rdi), %xmm2
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
@@ -3709,7 +3709,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 928(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 848(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm10[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3]
@@ -3717,14 +3717,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1248(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm6
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 1168(%rdi), %xmm7
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3]
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 128(%rdi), %xmm8
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm6
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
@@ -3733,7 +3733,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 448(%rdi), %xmm10
 ; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm4
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 368(%rdi), %xmm11
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3]
@@ -3741,7 +3741,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 768(%rdi), %xmm12
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm4
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 688(%rdi), %xmm13
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm14[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3]
@@ -3749,7 +3749,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1088(%rdi), %xmm14
 ; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm4
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm0[0],ymm4[0],ymm0[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 1008(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
@@ -3823,14 +3823,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; AVX-NEXT:    vmovapd 288(%rdi), %ymm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[3],ymm13[2]
 ; AVX-NEXT:    vmovdqa 224(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm15[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3]
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm4
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm4[0],ymm14[0],ymm4[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm4[0],ymm14[1],ymm4[3],ymm14[2]
 ; AVX-NEXT:    vmovdqa 544(%rdi), %xmm11
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm8 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm8 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
@@ -3839,7 +3839,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm5
 ; AVX-NEXT:    vmovapd 928(%rdi), %ymm4
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 864(%rdi), %xmm8
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
@@ -3848,28 +3848,28 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
 ; AVX-NEXT:    vmovapd 1248(%rdi), %ymm4
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm9[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm7
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[3],ymm7[2]
 ; AVX-NEXT:    vmovdqa 1024(%rdi), %xmm9
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm1
 ; AVX-NEXT:    vmovapd 768(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
 ; AVX-NEXT:    vmovdqa 704(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 448(%rdi), %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[3],ymm3[2]
 ; AVX-NEXT:    vmovdqa 384(%rdi), %xmm2
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
@@ -3877,7 +3877,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 128(%rdi), %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[3],ymm12[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
@@ -7764,7 +7764,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 288(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 208(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -7773,7 +7773,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 608(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 528(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -7782,7 +7782,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 928(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 848(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -7792,7 +7792,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1168(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -7801,7 +7801,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1568(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1488(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -7809,14 +7809,14 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 1888(%rdi), %xmm11
 ; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1808(%rdi), %xmm12
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps 2208(%rdi), %xmm13
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 2128(%rdi), %xmm14
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
@@ -7824,7 +7824,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 2528(%rdi), %xmm15
 ; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 2448(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7835,7 +7835,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 48(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7846,7 +7846,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 368(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7857,7 +7857,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 688(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7868,7 +7868,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1008(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -7878,7 +7878,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1408(%rdi), %xmm8
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1328(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
@@ -7887,7 +7887,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 1728(%rdi), %xmm6
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1648(%rdi), %xmm7
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
@@ -7896,7 +7896,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 2048(%rdi), %xmm4
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1968(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
@@ -7905,7 +7905,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps 2368(%rdi), %xmm2
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm1
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 2288(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm9 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
@@ -8065,7 +8065,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
 ; AVX-NEXT:    vmovapd 128(%rdi), %ymm9
 ; AVX-NEXT:    vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[3],ymm9[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm9
 ; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
@@ -8074,7 +8074,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 288(%rdi), %ymm9
 ; AVX-NEXT:    vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[3],ymm9[2]
 ; AVX-NEXT:    vmovdqa 224(%rdi), %xmm9
 ; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
@@ -8084,7 +8084,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm6
 ; AVX-NEXT:    vmovapd 448(%rdi), %ymm7
 ; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[3],ymm7[2]
 ; AVX-NEXT:    vmovdqa 384(%rdi), %xmm7
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm8[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
@@ -8093,7 +8093,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm7
 ; AVX-NEXT:    vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[3],ymm7[2]
 ; AVX-NEXT:    vmovdqa 544(%rdi), %xmm7
 ; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
@@ -8103,7 +8103,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
 ; AVX-NEXT:    vmovapd 768(%rdi), %ymm6
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 704(%rdi), %xmm6
 ; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
@@ -8112,7 +8112,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 928(%rdi), %ymm5
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[3],ymm5[2]
 ; AVX-NEXT:    vmovdqa 864(%rdi), %xmm5
 ; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
@@ -8122,7 +8122,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm4
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 1024(%rdi), %xmm4
 ; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
@@ -8131,7 +8131,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 1248(%rdi), %ymm3
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[3],ymm3[2]
 ; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
@@ -8141,7 +8141,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm1
 ; AVX-NEXT:    vmovapd 1408(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 1344(%rdi), %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
@@ -8149,7 +8149,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 1568(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[3],ymm14[2]
 ; AVX-NEXT:    vmovdqa 1504(%rdi), %xmm13
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
@@ -8157,7 +8157,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm0
 ; AVX-NEXT:    vmovapd 1728(%rdi), %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[3],ymm12[2]
 ; AVX-NEXT:    vmovdqa 1664(%rdi), %xmm11
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
@@ -8165,7 +8165,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 1888(%rdi), %ymm10
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[3],ymm10[2]
 ; AVX-NEXT:    vmovdqa 1824(%rdi), %xmm9
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
@@ -8173,7 +8173,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 2048(%rdi), %ymm8
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[3],ymm8[2]
 ; AVX-NEXT:    vmovdqa 1984(%rdi), %xmm7
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
@@ -8181,7 +8181,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 2208(%rdi), %ymm6
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 2144(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
@@ -8189,7 +8189,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 2368(%rdi), %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 2304(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
@@ -8197,7 +8197,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
 ; AVX-NEXT:    vmovapd 2528(%rdi), %ymm2
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 2464(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm15 = xmm15[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3]

diff  --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index c5be77db9ecf50..4e5501b1041d3f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -55,7 +55,7 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendps {{.*#+}} xmm4 = mem[0,1],xmm1[2,3]
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm5 = xmm0[0],mem[1]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
@@ -451,7 +451,7 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm9
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm10 = mem[0],xmm3[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[3],ymm4[2]
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovdqa 128(%rdi), %xmm8
@@ -459,7 +459,7 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm10 = xmm7[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
@@ -468,7 +468,7 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm2[0,1,2],ymm8[3]
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm9
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3]
@@ -1225,10 +1225,10 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm14 = xmm13[0],xmm4[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[3],ymm8[2]
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[3],ymm9[2]
 ; AVX-NEXT:    vmovdqa 288(%rdi), %xmm14
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3]
@@ -1243,13 +1243,13 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vmovapd 240(%rdi), %xmm14
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm15 = xmm14[0],mem[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm9 = ymm15[0,1],ymm9[2,3]
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm11[1],ymm10[0],ymm11[2],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm11[1],ymm10[0],ymm11[2],ymm10[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3]
 ; AVX-NEXT:    vmovapd 304(%rdi), %xmm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm14[1],ymm13[0],ymm14[2],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm14[1],ymm13[0],ymm14[2],ymm13[3]
 ; AVX-NEXT:    vmovdqa 416(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
@@ -1266,12 +1266,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovapd 192(%rdi), %ymm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm15[0],ymm13[0],ymm15[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm15 = ymm15[0],ymm13[1],ymm15[3],ymm13[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3]
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm15
-; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[3],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[3],ymm15[2]
 ; AVX-NEXT:    vmovdqa 320(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3]
@@ -2817,23 +2817,23 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovdqa 288(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm5[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 384(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 736(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 832(%rdi), %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[3],ymm4[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 160(%rdi), %ymm2
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm8
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[3],ymm8[2]
 ; AVX-NEXT:    vmovdqa 512(%rdi), %xmm9
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
@@ -2867,28 +2867,28 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 752(%rdi), %xmm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm8 = ymm5[1],ymm12[0],ymm5[2],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm8 = ymm5[1],ymm12[0],ymm5[2],ymm12[3]
 ; AVX-NEXT:    vmovdqa 864(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 304(%rdi), %xmm7
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3]
 ; AVX-NEXT:    vmovdqa 416(%rdi), %xmm8
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 528(%rdi), %xmm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[3]
 ; AVX-NEXT:    vmovdqa 640(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
@@ -2917,22 +2917,22 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm4 = mem[0],xmm11[1]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm4[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm2
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[0],ymm6[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[1],ymm6[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 320(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3]
 ; AVX-NEXT:    vmovapd 864(%rdi), %ymm4
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 768(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovdqa 544(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 640(%rdi), %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[0],ymm7[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[1],ymm7[3],ymm3[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovapd 192(%rdi), %ymm1
-; AVX-NEXT:    vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[0],ymm10[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3]
@@ -5858,49 +5858,49 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovdqa 288(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 384(%rdi), %ymm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3]
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 736(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 832(%rdi), %ymm7
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[3],ymm7[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1280(%rdi), %ymm6
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[3],ymm6[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 1632(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1728(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[0],ymm14[3],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[1],ymm14[3],ymm11[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 160(%rdi), %ymm8
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[3],ymm8[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[0],ymm9[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[1],ymm9[3],ymm14[2]
 ; AVX-NEXT:    vmovdqa 512(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1056(%rdi), %ymm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm15[0],ymm13[0],ymm15[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm15[0],ymm13[1],ymm15[3],ymm13[2]
 ; AVX-NEXT:    vmovdqa 960(%rdi), %xmm15
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1504(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
 ; AVX-NEXT:    vmovdqa 1408(%rdi), %xmm9
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
@@ -5966,14 +5966,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3]
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 304(%rdi), %xmm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[3]
 ; AVX-NEXT:    vmovdqa 416(%rdi), %xmm13
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 752(%rdi), %xmm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[3]
 ; AVX-NEXT:    vmovdqa 864(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -5981,7 +5981,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1200(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 1312(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
@@ -5990,7 +5990,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1648(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 1760(%rdi), %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
@@ -6000,21 +6000,21 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1424(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 1536(%rdi), %xmm11
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 976(%rdi), %xmm15
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm15[0],ymm1[2],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm15[0],ymm1[2],ymm15[3]
 ; AVX-NEXT:    vmovdqa 1088(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 528(%rdi), %xmm0
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 640(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
@@ -6023,7 +6023,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm3
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm7
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm8
@@ -6092,14 +6092,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 192(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[3],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[3],ymm11[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm10
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[3],ymm10[2]
 ; AVX-NEXT:    vmovdqa 320(%rdi), %xmm6
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
@@ -6108,11 +6108,11 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 640(%rdi), %ymm5
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 864(%rdi), %ymm7
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[3],ymm7[2]
 ; AVX-NEXT:    vmovdqa 768(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
@@ -6121,11 +6121,11 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm3
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[3],ymm3[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1312(%rdi), %ymm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[0],ymm1[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[1],ymm1[3],ymm9[2]
 ; AVX-NEXT:    vmovdqa 1216(%rdi), %xmm2
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3]
@@ -6133,11 +6133,11 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1536(%rdi), %ymm8
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[3],ymm8[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1760(%rdi), %ymm1
-; AVX-NEXT:    vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[0],ymm12[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[1],ymm12[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 1664(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3]
@@ -12923,40 +12923,40 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 384(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[3],ymm2[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 736(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 832(%rdi), %ymm14
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[0],ymm4[3],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[1],ymm4[3],ymm14[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 1184(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1280(%rdi), %ymm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[0],ymm3[3],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[3],ymm13[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 1632(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1728(%rdi), %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm6[0],ymm12[0],ymm6[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm6[0],ymm12[1],ymm6[3],ymm12[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 2080(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 2176(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[0],ymm7[3],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[1],ymm7[3],ymm11[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 2528(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 2624(%rdi), %ymm10
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[0],ymm8[3],ymm10[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[1],ymm8[3],ymm10[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 2976(%rdi), %xmm1
@@ -12964,20 +12964,20 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 3072(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovdqa 3424(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 3520(%rdi), %ymm15
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[3],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[3],ymm15[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 160(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -12986,7 +12986,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 608(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 512(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -12996,7 +12996,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 1056(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 960(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -13006,7 +13006,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 1504(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 1408(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
@@ -13014,7 +13014,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1952(%rdi), %ymm9
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[3],ymm9[2]
 ; AVX-NEXT:    vmovdqa 1856(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -13023,7 +13023,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2400(%rdi), %ymm6
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 2304(%rdi), %xmm8
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
@@ -13031,7 +13031,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2848(%rdi), %ymm4
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[3],ymm4[2]
 ; AVX-NEXT:    vmovdqa 2752(%rdi), %xmm5
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
@@ -13039,7 +13039,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 3296(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 3200(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
@@ -13189,7 +13189,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3]
 ; AVX-NEXT:    vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 80(%rdi), %xmm13
-; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[3]
 ; AVX-NEXT:    vmovdqa 192(%rdi), %xmm12
 ; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
@@ -13199,7 +13199,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 304(%rdi), %xmm2
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
 ; AVX-NEXT:    vmovdqa 416(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
@@ -13208,7 +13208,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3]
 ; AVX-NEXT:    vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 528(%rdi), %xmm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[3]
 ; AVX-NEXT:    vmovdqa 640(%rdi), %xmm2
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
@@ -13217,7 +13217,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 752(%rdi), %xmm9
 ; AVX-NEXT:    vmovupd %ymm9, (%rsp) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[3]
 ; AVX-NEXT:    vmovdqa 864(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
@@ -13226,7 +13226,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3]
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 976(%rdi), %xmm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[3]
 ; AVX-NEXT:    vmovdqa 1088(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
@@ -13235,7 +13235,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 1200(%rdi), %xmm6
 ; AVX-NEXT:    vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[3]
 ; AVX-NEXT:    vmovdqa 1312(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
@@ -13245,7 +13245,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1424(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 1536(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
@@ -13254,7 +13254,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1648(%rdi), %xmm7
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3]
 ; AVX-NEXT:    vmovdqa 1760(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
@@ -13264,7 +13264,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1872(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 1984(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
@@ -13274,7 +13274,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 2096(%rdi), %xmm3
 ; AVX-NEXT:    vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
 ; AVX-NEXT:    vmovdqa 2208(%rdi), %xmm3
 ; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
@@ -13284,7 +13284,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2320(%rdi), %xmm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[3]
 ; AVX-NEXT:    vmovdqa 2432(%rdi), %xmm3
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
@@ -13292,7 +13292,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2544(%rdi), %xmm14
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[3]
 ; AVX-NEXT:    vmovdqa 2656(%rdi), %xmm4
 ; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
@@ -13303,7 +13303,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 2768(%rdi), %xmm4
 ; AVX-NEXT:    vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[3]
 ; AVX-NEXT:    vmovdqa 2880(%rdi), %xmm8
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
@@ -13313,7 +13313,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 2992(%rdi), %xmm5
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
 ; AVX-NEXT:    vmovdqa 3104(%rdi), %xmm5
 ; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
@@ -13324,7 +13324,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 3216(%rdi), %xmm5
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
 ; AVX-NEXT:    vmovdqa 3328(%rdi), %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
@@ -13335,7 +13335,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 3440(%rdi), %xmm5
 ; AVX-NEXT:    vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
 ; AVX-NEXT:    vmovdqa 3552(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm6 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
@@ -13486,14 +13486,14 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovapd 192(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 96(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 416(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 320(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -13505,12 +13505,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 640(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[3],ymm1[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 864(%rdi), %ymm0
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[3],ymm0[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[3],ymm0[2]
 ; AVX-NEXT:    vmovdqa 768(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -13522,13 +13522,13 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1088(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[3],ymm1[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1312(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 1216(%rdi), %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -13538,13 +13538,13 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovdqa 1440(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1536(%rdi), %ymm15
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[0],ymm9[3],ymm15[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[1],ymm9[3],ymm15[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 1760(%rdi), %ymm1
 ; AVX-NEXT:    vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
 ; AVX-NEXT:    vmovdqa 1664(%rdi), %xmm13
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
@@ -13553,11 +13553,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovdqa 1888(%rdi), %xmm0
 ; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 1984(%rdi), %ymm11
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[0],ymm7[3],ymm11[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[3],ymm11[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2208(%rdi), %ymm12
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[0],ymm10[3],ymm12[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[3],ymm12[2]
 ; AVX-NEXT:    vmovdqa 2112(%rdi), %xmm10
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
@@ -13567,12 +13567,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 2432(%rdi), %ymm9
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[0],ymm3[3],ymm9[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[3],ymm9[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 2656(%rdi), %ymm8
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[3],ymm8[2]
 ; AVX-NEXT:    vmovdqa 2560(%rdi), %xmm7
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
@@ -13582,12 +13582,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 2880(%rdi), %ymm5
-; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[3],ymm5[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 3104(%rdi), %ymm6
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[3],ymm6[2]
 ; AVX-NEXT:    vmovdqa 3008(%rdi), %xmm4
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
@@ -13597,12 +13597,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    vmovapd 3328(%rdi), %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[0],ymm14[3],ymm3[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[1],ymm14[3],ymm3[2]
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
 ; AVX-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovapd 3552(%rdi), %ymm2
 ; AVX-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2]
+; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[2]
 ; AVX-NEXT:    vmovdqa 3456(%rdi), %xmm1
 ; AVX-NEXT:    vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index b93fd4b8f62fbf..779ae33a27715f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -725,7 +725,7 @@ define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_0z3z:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 0, i32 4, i32 3, i32 4>
   ret <4 x double> %shuffle
@@ -735,7 +735,7 @@ define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_1z2z:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2]
+; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3]
 ; ALL-NEXT:    retq
   %1 = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
   ret <4 x double> %1
@@ -1202,7 +1202,7 @@ define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_2u35:
@@ -1952,7 +1952,7 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_1z2z:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v4i64_1z2z:
@@ -2038,9 +2038,7 @@ define <4 x double> @add_v4f64_024u_135u_reverse(<4 x double> %a, <4 x double> %
 ; AVX1-LABEL: add_v4f64_024u_135u_reverse:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT:    vshufpd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v4f64_024u_135u_reverse:

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 97b262cc7ac5c3..a706e4ce00ed3d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -790,7 +790,7 @@ define <8 x double> @shuffle_v8f64_1z2z5z6z(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_1z2z5z6z:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[2],zmm1[3],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x double> %a, <8 x double> <double 0.000000e+00, double undef, double undef, double undef, double undef, double undef, double undef, double undef>, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 5, i32 8, i32 6, i32 8>
   ret <8 x double> %shuffle
@@ -1663,7 +1663,7 @@ define <8 x double> @shuffle_v8f64_z9zbzdzf(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_z9zbzdzf:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[7]
+; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x double> zeroinitializer, <8 x double> %b, <8 x i32><i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
   ret <8 x double> %shuffle


        


More information about the llvm-commits mailing list