[llvm] [X86] EltsFromConsecutiveLoads - recognise reverse load patterns. (PR #168706)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 19 06:29:38 PST 2025


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/168706

From 8113a2e8d8e072ec2ce5c304941d61226b09303d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 19 Nov 2025 13:31:25 +0000
Subject: [PATCH 1/2] [X86] EltsFromConsecutiveLoads - recognise reverse load
 patterns.

See if we can create a vector load from the source elements in reverse order and then shuffle them back into place.

The SLP vectorizer will (usually) catch this in the middle-end, but there are a few BUILD_VECTOR scalarizations etc. that only appear during DAG legalization.
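
For example, a minimal sketch of the kind of IR this targets (abridged from the merge_v4f32_f32_3210 test below; the value names are illustrative):

  ; IR sketch (names illustrative) - element i of the result comes from ptr[3-i]:
  %p3 = getelementptr inbounds float, ptr %ptr, i64 3
  %p2 = getelementptr inbounds float, ptr %ptr, i64 2
  %p1 = getelementptr inbounds float, ptr %ptr, i64 1
  %v3 = load float, ptr %p3
  %v2 = load float, ptr %p2
  %v1 = load float, ptr %p1
  %v0 = load float, ptr %ptr
  %r0 = insertelement <4 x float> poison, float %v3, i32 0
  %r1 = insertelement <4 x float> %r0, float %v2, i32 1
  %r2 = insertelement <4 x float> %r1, float %v1, i32 2
  %r3 = insertelement <4 x float> %r2, float %v0, i32 3

With this fold the four scalar loads become a single vector load plus a reverse shuffle (movups + shufps on SSE, a load-folded vpermilps on AVX) instead of a chain of scalar loads and inserts.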

I did start looking at a more general permute fold, but I haven't found any good test examples for this yet - happy to take another look if somebody has examples.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  13 +
 llvm/test/CodeGen/X86/bitcnt-big-integer.ll   |  20 +-
 llvm/test/CodeGen/X86/build-vector-256.ll     |   5 +-
 llvm/test/CodeGen/X86/chain_order.ll          |   3 +-
 .../X86/merge-consecutive-loads-128.ll        | 264 ++++--------------
 .../X86/merge-consecutive-loads-256.ll        | 229 +++++----------
 .../X86/merge-consecutive-loads-512.ll        | 127 +--------
 7 files changed, 161 insertions(+), 500 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aa9ba6b0e197c..661eace037de9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7557,6 +7557,19 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
     }
   }
 
+  // REVERSE - attempt to match the loads in reverse and then shuffle back.
+  // TODO: Do this for any permute or mismatching element counts.
+  if (Depth == 0 && !ZeroMask && TLI.isTypeLegal(VT) && VT.isVector() &&
+      NumElems == VT.getVectorNumElements()) {
+    SmallVector<SDValue, 4> ReverseElts(Elts.rbegin(), Elts.rend());
+    if (SDValue RevLd = EltsFromConsecutiveLoads(
+            VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
+      SmallVector<int, 16> ReverseMask(NumElems);
+      std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
+      return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask);
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index 330c978d2a9f7..22c4ad28059e4 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -844,13 +844,11 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind {
 ; AVX512-NEXT:    vmovq %rcx, %xmm2
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovq %r8, %xmm1
-; AVX512-NEXT:    vmovq %r9, %xmm2
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512-NEXT:    vmovq %r8, %xmm2
+; AVX512-NEXT:    vmovq %r9, %xmm3
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vplzcntq %zmm0, %zmm1
 ; AVX512-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
@@ -2071,13 +2069,11 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind {
 ; AVX512-NEXT:    vmovq %rcx, %xmm2
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovq %r8, %xmm1
-; AVX512-NEXT:    vmovq %r9, %xmm2
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512-NEXT:    vmovq %r8, %xmm2
+; AVX512-NEXT:    vmovq %r9, %xmm3
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512-NEXT:    vplzcntq %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll
index 3edb712e53c8d..773eb8f6742e5 100644
--- a/llvm/test/CodeGen/X86/build-vector-256.ll
+++ b/llvm/test/CodeGen/X86/build-vector-256.ll
@@ -417,9 +417,8 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
 ; AVX1-32-LABEL: test_buildvector_4f64_2_var:
 ; AVX1-32:       # %bb.0:
-; AVX1-32-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
-; AVX1-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-32-NEXT:    vmovupd {{[0-9]+}}(%esp), %xmm0
+; AVX1-32-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX1-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-32-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/chain_order.ll b/llvm/test/CodeGen/X86/chain_order.ll
index 3ced27f12c72a..18faec5747abe 100644
--- a/llvm/test/CodeGen/X86/chain_order.ll
+++ b/llvm/test/CodeGen/X86/chain_order.ll
@@ -6,9 +6,8 @@ define void @cftx020(ptr nocapture %a) {
 ; CHECK-LABEL: cftx020:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0]
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovupd (%rdi), %xmm1
 ; CHECK-NEXT:    vmovupd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 26f076d450c15..b6aae486dc315 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -354,53 +354,23 @@ define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp
 }
 
 define <4 x float> @merge_v4f32_f32_3210(ptr %ptr) nounwind uwtable noinline ssp {
-; SSE2-LABEL: merge_v4f32_f32_3210:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: merge_v4f32_f32_3210:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; SSE41-NEXT:    retq
+; SSE-LABEL: merge_v4f32_f32_3210:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movups (%rdi), %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_v4f32_f32_3210:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
 ; AVX-NEXT:    retq
 ;
-; X86-SSE1-LABEL: merge_v4f32_f32_3210:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE41-LABEL: merge_v4f32_f32_3210:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X86-SSE41-NEXT:    retl
+; X86-SSE-LABEL: merge_v4f32_f32_3210:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movups (%eax), %xmm0
+; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X86-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
   %ptr1 = getelementptr inbounds float, ptr %ptr, i64 2
   %ptr2 = getelementptr inbounds float, ptr %ptr, i64 1
@@ -788,31 +758,15 @@ define <4 x i32> @merge_4i32_i32_45zz_inc5(ptr %ptr) nounwind uwtable noinline s
 }
 
 define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp {
-; SSE2-LABEL: merge_v4i32_i32_3210:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: merge_v4i32_i32_3210:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT:    pinsrd $1, 8(%rdi), %xmm0
-; SSE41-NEXT:    pinsrd $2, 4(%rdi), %xmm0
-; SSE41-NEXT:    pinsrd $3, (%rdi), %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: merge_v4i32_i32_3210:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqu (%rdi), %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_v4i32_i32_3210:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vpinsrd $1, 8(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrd $2, 4(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
 ; AVX-NEXT:    retq
 ;
 ; X86-SSE1-LABEL: merge_v4i32_i32_3210:
@@ -842,10 +796,8 @@ define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp {
 ; X86-SSE41-LABEL: merge_v4i32_i32_3210:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE41-NEXT:    pinsrd $1, 8(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrd $2, 4(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrd $3, (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
 ; X86-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3
   %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 2
@@ -1003,55 +955,22 @@ define <8 x i16> @merge_8i16_i16_45u7zzzz(ptr %ptr) nounwind uwtable noinline ss
 define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ssp {
 ; SSE2-LABEL: merge_8i16_i16_76543210:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movzwl (%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzwl 2(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movzwl 4(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzwl 6(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movzwl 8(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzwl 10(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movzwl 12(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    movzwl 14(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    movdqu (%rdi), %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: merge_8i16_i16_76543210:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movzwl 14(%rdi), %eax
-; SSE41-NEXT:    movd %eax, %xmm0
-; SSE41-NEXT:    pinsrw $1, 12(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $2, 10(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $3, 8(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $4, 6(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $5, 4(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $6, 2(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $7, (%rdi), %xmm0
+; SSE41-NEXT:    movdqu (%rdi), %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: merge_8i16_i16_76543210:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movzwl 14(%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpinsrw $1, 12(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $2, 10(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $3, 8(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $4, 6(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $5, 4(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $6, 2(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
 ; AVX-NEXT:    retq
 ;
 ; X86-SSE1-LABEL: merge_8i16_i16_76543210:
@@ -1107,15 +1026,8 @@ define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ss
 ; X86-SSE41-LABEL: merge_8i16_i16_76543210:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movzwl 14(%eax), %ecx
-; X86-SSE41-NEXT:    movd %ecx, %xmm0
-; X86-SSE41-NEXT:    pinsrw $1, 12(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $2, 10(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $3, 8(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $4, 6(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $5, 4(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $6, 2(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $7, (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
 ; X86-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 7
   %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 6
@@ -1341,95 +1253,30 @@ define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(ptr %ptr) nounwind uwtable noin
 define <16 x i8> @merge_16i8_i8_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
 ; SSE2-LABEL: merge_16i8_i8_FEDCBA9876543210:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movzbl (%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 1(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    movzbl 2(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 3(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    movzbl 4(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 5(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movzbl 6(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 7(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movzbl 8(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 9(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl 10(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 11(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT:    movzbl 12(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 13(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl 14(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    movzbl 15(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movdqu (%rdi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movzbl 15(%rdi), %eax
-; SSE41-NEXT:    movd %eax, %xmm0
-; SSE41-NEXT:    pinsrb $1, 14(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $2, 13(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $3, 12(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $4, 11(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $5, 10(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $6, 9(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $7, 8(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $8, 7(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $9, 6(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $10, 5(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $11, 4(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $12, 3(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $13, 2(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $14, 1(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $15, (%rdi), %xmm0
+; SSE41-NEXT:    movdqu (%rdi), %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: merge_16i8_i8_FEDCBA9876543210:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movzbl 15(%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpinsrb $1, 14(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $2, 13(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $3, 12(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $4, 11(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $5, 10(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $6, 9(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $7, 8(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $8, 7(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $9, 6(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $10, 5(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $11, 4(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $12, 3(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $13, 2(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $14, 1(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $15, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; AVX-NEXT:    retq
 ;
 ; X86-SSE1-LABEL: merge_16i8_i8_FEDCBA9876543210:
@@ -1507,23 +1354,8 @@ define <16 x i8> @merge_16i8_i8_FEDCBA9876543210(ptr %ptr) nounwind uwtable noin
 ; X86-SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movzbl 15(%eax), %ecx
-; X86-SSE41-NEXT:    movd %ecx, %xmm0
-; X86-SSE41-NEXT:    pinsrb $1, 14(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $2, 13(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $3, 12(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $4, 11(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $5, 10(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $6, 9(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $7, 8(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $8, 7(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $9, 6(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $10, 5(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $11, 4(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $12, 3(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $13, 2(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $14, 1(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrb $15, (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X86-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i8, ptr %ptr, i64 15
   %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 14
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
index e5e99e17053a0..6ad306d2e6564 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -127,23 +127,27 @@ define <4 x double> @merge_4f64_f64_45zz(ptr %ptr) nounwind uwtable noinline ssp
 }
 
 define <4 x double> @merge_v4f64_f64_3210(ptr %ptr) nounwind uwtable noinline ssp {
-; AVX-LABEL: merge_v4f64_f64_3210:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; AVX-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: merge_v4f64_f64_3210:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: merge_v4f64_f64_3210:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: merge_v4f64_f64_3210:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX512F-NEXT:    retq
 ;
 ; X86-AVX-LABEL: merge_v4f64_f64_3210:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; X86-AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
 ; X86-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds double, ptr %ptr, i64 3
   %ptr1 = getelementptr inbounds double, ptr %ptr, i64 2
@@ -269,16 +273,21 @@ define <4 x i64> @merge_4i64_i64_23zz(ptr %ptr) nounwind uwtable noinline ssp {
 }
 
 define <4 x i64> @merge_v4i64_i64_3210(ptr %ptr) nounwind uwtable noinline ssp {
-; AVX-LABEL: merge_v4i64_i64_3210:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: merge_v4i64_i64_3210:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: merge_v4i64_i64_3210:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: merge_v4i64_i64_3210:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX512F-NEXT:    retq
 ;
 ; X86-AVX-LABEL: merge_v4i64_i64_3210:
 ; X86-AVX:       # %bb.0:
@@ -410,31 +419,29 @@ define <8 x float> @merge_8f32_f32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline
 }
 
 define <8 x float> @merge_8f32_f32_76543210(ptr %ptr) nounwind uwtable noinline ssp {
-; AVX-LABEL: merge_8f32_f32_76543210:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: merge_8f32_f32_76543210:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: merge_8f32_f32_76543210:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: merge_8f32_f32_76543210:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT:    retq
 ;
 ; X86-AVX-LABEL: merge_8f32_f32_76543210:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; X86-AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; X86-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds float, ptr %ptr, i64 7
   %ptr1 = getelementptr inbounds float, ptr %ptr, i64 6
@@ -545,55 +552,27 @@ define <8 x i32> @merge_8i32_i32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ss
 define <8 x i32> @merge_8i32_i32_76543210(ptr %ptr) nounwind uwtable noinline ssp {
 ; AVX1-LABEL: merge_8i32_i32_76543210:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpinsrd $1, 8(%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrd $2, 4(%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpinsrd $1, 24(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vpinsrd $2, 20(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vpinsrd $3, 16(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: merge_8i32_i32_76543210:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpinsrd $1, 8(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrd $2, 4(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpinsrd $1, 24(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vpinsrd $2, 20(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vpinsrd $3, 16(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: merge_8i32_i32_76543210:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT:    vpinsrd $1, 8(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrd $2, 4(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512F-NEXT:    vpinsrd $1, 24(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrd $2, 20(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrd $3, 16(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX512F-NEXT:    retq
 ;
 ; X86-AVX-LABEL: merge_8i32_i32_76543210:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpinsrd $1, 8(%eax), %xmm0, %xmm0
-; X86-AVX-NEXT:    vpinsrd $2, 4(%eax), %xmm0, %xmm0
-; X86-AVX-NEXT:    vpinsrd $3, (%eax), %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpinsrd $1, 24(%eax), %xmm1, %xmm1
-; X86-AVX-NEXT:    vpinsrd $2, 20(%eax), %xmm1, %xmm1
-; X86-AVX-NEXT:    vpinsrd $3, 16(%eax), %xmm1, %xmm1
-; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; X86-AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; X86-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 7
   %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 6
@@ -733,94 +712,36 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable n
 define <16 x i16> @merge_16i16_i16_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
 ; AVX1-LABEL: merge_16i16_i16_FEDCBA9876543210:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    movzwl 14(%rdi), %eax
-; AVX1-NEXT:    vmovd %eax, %xmm0
-; AVX1-NEXT:    vpinsrw $1, 12(%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrw $2, 10(%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrw $3, 8(%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrw $4, 6(%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrw $5, 4(%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrw $6, 2(%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    movzwl 30(%rdi), %eax
-; AVX1-NEXT:    vmovd %eax, %xmm1
-; AVX1-NEXT:    vpinsrw $1, 28(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vpinsrw $2, 26(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vpinsrw $3, 24(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vpinsrw $4, 22(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vpinsrw $5, 20(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vpinsrw $6, 18(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vpinsrw $7, 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: merge_16i16_i16_FEDCBA9876543210:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movzwl 14(%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpinsrw $1, 12(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrw $2, 10(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrw $3, 8(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrw $4, 6(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrw $5, 4(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrw $6, 2(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    movzwl 30(%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpinsrw $1, 28(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vpinsrw $2, 26(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vpinsrw $3, 24(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vpinsrw $4, 22(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vpinsrw $5, 20(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vpinsrw $6, 18(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vpinsrw $7, 16(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: merge_16i16_i16_FEDCBA9876543210:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movzwl 14(%rdi), %eax
-; AVX512F-NEXT:    vmovd %eax, %xmm0
-; AVX512F-NEXT:    vpinsrw $1, 12(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrw $2, 10(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrw $3, 8(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrw $4, 6(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrw $5, 4(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrw $6, 2(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    movzwl 30(%rdi), %eax
-; AVX512F-NEXT:    vmovd %eax, %xmm1
-; AVX512F-NEXT:    vpinsrw $1, 28(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrw $2, 26(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrw $3, 24(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrw $4, 22(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrw $5, 20(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrw $6, 18(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrw $7, 16(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX512F-NEXT:    retq
 ;
 ; X86-AVX-LABEL: merge_16i16_i16_FEDCBA9876543210:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movzwl 14(%eax), %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm0
-; X86-AVX-NEXT:    vpinsrw $1, 12(%eax), %xmm0, %xmm0
-; X86-AVX-NEXT:    vpinsrw $2, 10(%eax), %xmm0, %xmm0
-; X86-AVX-NEXT:    vpinsrw $3, 8(%eax), %xmm0, %xmm0
-; X86-AVX-NEXT:    vpinsrw $4, 6(%eax), %xmm0, %xmm0
-; X86-AVX-NEXT:    vpinsrw $5, 4(%eax), %xmm0, %xmm0
-; X86-AVX-NEXT:    vpinsrw $6, 2(%eax), %xmm0, %xmm0
-; X86-AVX-NEXT:    vpinsrw $7, (%eax), %xmm0, %xmm0
-; X86-AVX-NEXT:    movzwl 30(%eax), %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm1
-; X86-AVX-NEXT:    vpinsrw $1, 28(%eax), %xmm1, %xmm1
-; X86-AVX-NEXT:    vpinsrw $2, 26(%eax), %xmm1, %xmm1
-; X86-AVX-NEXT:    vpinsrw $3, 24(%eax), %xmm1, %xmm1
-; X86-AVX-NEXT:    vpinsrw $4, 22(%eax), %xmm1, %xmm1
-; X86-AVX-NEXT:    vpinsrw $5, 20(%eax), %xmm1, %xmm1
-; X86-AVX-NEXT:    vpinsrw $6, 18(%eax), %xmm1, %xmm1
-; X86-AVX-NEXT:    vpinsrw $7, 16(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vmovdqu (%eax), %xmm0
+; X86-AVX-NEXT:    vmovdqu 16(%eax), %xmm1
+; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; X86-AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X86-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 15
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
index fabca0ea5007e..f9a0bd7f424d6 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -151,33 +151,15 @@ define <8 x double> @merge_8f64_f64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline
 define <8 x double> @merge_8f64_f64_76543210(ptr %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8f64_f64_76543210:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; ALL-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; ALL-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; ALL-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; ALL-NEXT:    vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1]
-; ALL-NEXT:    vmovhps {{.*#+}} xmm2 = xmm3[0,1],mem[0,1]
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; ALL-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
 ; ALL-NEXT:    retq
 ;
 ; X86-AVX512F-LABEL: merge_8f64_f64_76543210:
 ; X86-AVX512F:       # %bb.0:
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; X86-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
-; X86-AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; X86-AVX512F-NEXT:    vpermpd (%eax), %zmm0, %zmm0
 ; X86-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds double, ptr %ptr, i64 7
   %ptr1 = getelementptr inbounds double, ptr %ptr, i64 6
@@ -288,21 +270,8 @@ define <8 x i64> @merge_8i64_i64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ss
 define <8 x i64> @merge_8i64_i64_76543210(ptr %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8i64_i64_76543210:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; ALL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; ALL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; ALL-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; ALL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; ALL-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
 ; ALL-NEXT:    retq
 ;
 ; X86-AVX512F-LABEL: merge_8i64_i64_76543210:
@@ -466,49 +435,15 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable
 define <16 x float> @merge_16f32_f32_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16f32_f32_FEDCBA9876543210:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; ALL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; ALL-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[2,3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[2,3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; ALL-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
 ; ALL-NEXT:    retq
 ;
 ; X86-AVX512F-LABEL: merge_16f32_f32_FEDCBA9876543210:
 ; X86-AVX512F:       # %bb.0:
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; X86-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; X86-AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-AVX512F-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; X86-AVX512F-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
 ; X86-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds float, ptr %ptr, i64 15
   %ptr1 = getelementptr inbounds float, ptr %ptr, i64 14
@@ -672,49 +607,15 @@ define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable n
 define <16 x i32> @merge_16i32_i32_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16i32_i32_FEDCBA9876543210:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT:    vpinsrd $1, 8(%rdi), %xmm0, %xmm0
-; ALL-NEXT:    vpinsrd $2, 4(%rdi), %xmm0, %xmm0
-; ALL-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
-; ALL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; ALL-NEXT:    vpinsrd $1, 24(%rdi), %xmm1, %xmm1
-; ALL-NEXT:    vpinsrd $2, 20(%rdi), %xmm1, %xmm1
-; ALL-NEXT:    vpinsrd $3, 16(%rdi), %xmm1, %xmm1
-; ALL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; ALL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; ALL-NEXT:    vpinsrd $1, 40(%rdi), %xmm1, %xmm1
-; ALL-NEXT:    vpinsrd $2, 36(%rdi), %xmm1, %xmm1
-; ALL-NEXT:    vpinsrd $3, 32(%rdi), %xmm1, %xmm1
-; ALL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; ALL-NEXT:    vpinsrd $1, 56(%rdi), %xmm2, %xmm2
-; ALL-NEXT:    vpinsrd $2, 52(%rdi), %xmm2, %xmm2
-; ALL-NEXT:    vpinsrd $3, 48(%rdi), %xmm2, %xmm2
-; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; ALL-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
 ; ALL-NEXT:    retq
 ;
 ; X86-AVX512F-LABEL: merge_16i32_i32_FEDCBA9876543210:
 ; X86-AVX512F:       # %bb.0:
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX512F-NEXT:    vpinsrd $1, 8(%eax), %xmm0, %xmm0
-; X86-AVX512F-NEXT:    vpinsrd $2, 4(%eax), %xmm0, %xmm0
-; X86-AVX512F-NEXT:    vpinsrd $3, (%eax), %xmm0, %xmm0
-; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX512F-NEXT:    vpinsrd $1, 24(%eax), %xmm1, %xmm1
-; X86-AVX512F-NEXT:    vpinsrd $2, 20(%eax), %xmm1, %xmm1
-; X86-AVX512F-NEXT:    vpinsrd $3, 16(%eax), %xmm1, %xmm1
-; X86-AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX512F-NEXT:    vpinsrd $1, 40(%eax), %xmm1, %xmm1
-; X86-AVX512F-NEXT:    vpinsrd $2, 36(%eax), %xmm1, %xmm1
-; X86-AVX512F-NEXT:    vpinsrd $3, 32(%eax), %xmm1, %xmm1
-; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-AVX512F-NEXT:    vpinsrd $1, 56(%eax), %xmm2, %xmm2
-; X86-AVX512F-NEXT:    vpinsrd $2, 52(%eax), %xmm2, %xmm2
-; X86-AVX512F-NEXT:    vpinsrd $3, 48(%eax), %xmm2, %xmm2
-; X86-AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-AVX512F-NEXT:    vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; X86-AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
 ; X86-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 15
   %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 14

From d74a742fd2d59ca121a3d16c6ee4c3d63ca79186 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 19 Nov 2025 14:29:22 +0000
Subject: [PATCH 2/2] Matching SmallVector stack sizes

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 661eace037de9..40970e4d8595f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7561,7 +7561,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   // TODO: Do this for any permute or mismatching element counts.
   if (Depth == 0 && !ZeroMask && TLI.isTypeLegal(VT) && VT.isVector() &&
       NumElems == VT.getVectorNumElements()) {
-    SmallVector<SDValue, 4> ReverseElts(Elts.rbegin(), Elts.rend());
+    SmallVector<SDValue, 16> ReverseElts(Elts.rbegin(), Elts.rend());
     if (SDValue RevLd = EltsFromConsecutiveLoads(
             VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
       SmallVector<int, 16> ReverseMask(NumElems);


