[llvm-branch-commits] [llvm] DAG: Handle load in SimplifyDemandedVectorElts (PR #122671)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Jan 29 08:33:58 PST 2025


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/122671

From 8f15ec9984e0a72b5036588d157c34e59e61d936 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 13 Jan 2025 11:22:55 +0700
Subject: [PATCH] DAG: Handle load in SimplifyDemandedVectorElts

This improves some AMDGPU cases and avoids future regressions.
The combiner likes to form shuffles for cases where an extract_vector_elt
would do perfectly well, and this recovers some of the regressions from
losing load narrowing.

AMDGPU, AArch64 and RISCV test changes look broadly better. Other targets have
some improvements, but mostly regressions. In particular X86 looks much
worse. I'm guessing this is because its shouldReduceLoadWidth is wrong.

I mostly just regenerated the checks. I assume some subset of them should
switch to using volatile loads to defeat the optimization.
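
For illustration only (this is not a test from the patch, and the function
name is made up), a minimal IR sketch of the pattern this combine targets:
when only one lane of a loaded vector is demanded, the vector load can be
replaced with a narrow scalar load of just that element:

    define float @extract_one_lane(ptr %p) {
      ; Only element 1 of the vector is demanded downstream.
      %vec = load <4 x float>, ptr %p
      %elt = extractelement <4 x float> %vec, i32 1
      ret float %elt
    }

With the new ISD::LOAD case, SimplifyDemandedVectorElts can scalarize the
load (via scalarizeExtractedVectorLoad, folding the element offset into the
address) instead of keeping the full vector load alive behind a shuffle.
A volatile load would block this, which is why some tests switch to
volatile to preserve their original intent.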
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   31 +
 .../AArch64/arm64-big-endian-bitconverts.ll   |   54 +-
 .../AArch64/dag-ReplaceAllUsesOfValuesWith.ll |    5 +-
 llvm/test/CodeGen/AArch64/fcmp.ll             |   43 +-
 llvm/test/CodeGen/AArch64/fmlal-loreg.ll      |    8 +-
 llvm/test/CodeGen/AArch64/icmp.ll             |   17 +-
 .../sve-fixed-length-extract-vector-elt.ll    |  114 +-
 .../AArch64/sve-fixed-length-masked-gather.ll |    3 +-
 .../sve-fixed-length-masked-scatter.ll        |   10 +-
 ...ng-mode-fixed-length-extract-vector-elt.ll |   30 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll     |   39 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll     |   38 +-
 .../AMDGPU/greedy-reverse-local-assignment.ll |   21 +-
 .../identical-subrange-spill-infloop.ll       |    6 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |   24 +-
 .../AMDGPU/shader-addr64-nonuniform.ll        |    8 +-
 llvm/test/CodeGen/AMDGPU/trunc.ll             |    4 +-
 .../test/CodeGen/AMDGPU/vector_rebroadcast.ll | 2657 +++++++++--------
 .../CodeGen/AMDGPU/vector_shuffle.packed.ll   |  250 +-
 .../ARM/crash-on-pow2-shufflevector.ll        |    5 +-
 llvm/test/CodeGen/ARM/vector-promotion.ll     |   30 +-
 llvm/test/CodeGen/ARM/vext.ll                 |    7 +-
 llvm/test/CodeGen/ARM/vuzp.ll                 |   14 +-
 llvm/test/CodeGen/Mips/cconv/vector.ll        |   65 +-
 .../test/CodeGen/Mips/msa/basic_operations.ll |   96 +-
 llvm/test/CodeGen/NVPTX/i128.ll               |   10 +-
 llvm/test/CodeGen/NVPTX/i8x4-instructions.ll  |   48 +-
 .../PowerPC/aix-vector-byval-callee.ll        |    4 +-
 .../PowerPC/canonical-merge-shuffles.ll       |    9 +-
 llvm/test/CodeGen/PowerPC/const-stov.ll       |   15 +-
 llvm/test/CodeGen/PowerPC/pr27078.ll          |   22 +-
 llvm/test/CodeGen/PowerPC/pre-inc-disable.ll  |   28 +-
 .../PowerPC/v16i8_scalar_to_vector_shuffle.ll |  120 +-
 .../PowerPC/v2i64_scalar_to_vector_shuffle.ll |  212 +-
 .../PowerPC/v8i16_scalar_to_vector_shuffle.ll |    6 +-
 llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll   |  104 +-
 .../RISCV/rvv/fixed-vectors-fp-shuffles.ll    |   27 +-
 llvm/test/CodeGen/Thumb2/mve-extractstore.ll  |   24 +-
 .../CodeGen/Thumb2/mve-insertshuffleload.ll   |   48 +-
 llvm/test/CodeGen/X86/SwizzleShuff.ll         |    2 +-
 llvm/test/CodeGen/X86/avx-vbroadcast.ll       |    6 +-
 llvm/test/CodeGen/X86/avx.ll                  |    6 +-
 .../CodeGen/X86/avx1-logical-load-folding.ll  |   28 +-
 llvm/test/CodeGen/X86/avx512-arith.ll         |    4 +-
 .../CodeGen/X86/avx512-broadcast-arith.ll     |   12 +-
 llvm/test/CodeGen/X86/avx512-calling-conv.ll  |    4 +-
 llvm/test/CodeGen/X86/avx512-cmp.ll           |    2 +-
 llvm/test/CodeGen/X86/avx512-ext.ll           |   18 +-
 .../avx512-extract-subvector-load-store.ll    |   12 +-
 .../X86/avx512-intrinsics-fast-isel.ll        |   25 +-
 llvm/test/CodeGen/X86/avx512-load-store.ll    |   32 +-
 llvm/test/CodeGen/X86/avx512-logic.ll         |   14 +-
 llvm/test/CodeGen/X86/avx512-select.ll        |    4 +-
 .../X86/avx512-shuffles/partial_permute.ll    |  802 ++---
 .../X86/avx512-shuffles/shuffle-interleave.ll |   31 +-
 .../CodeGen/X86/avx512-shuffles/unpack.ll     |   40 +-
 llvm/test/CodeGen/X86/avx512fp16-mov.ll       |    8 +-
 llvm/test/CodeGen/X86/bitreverse.ll           |   10 +-
 llvm/test/CodeGen/X86/buildvec-insertvec.ll   |   16 +-
 llvm/test/CodeGen/X86/combine-fabs.ll         |    9 +-
 llvm/test/CodeGen/X86/combine-sdiv.ll         |   12 +-
 llvm/test/CodeGen/X86/combine-udiv.ll         |    8 +-
 llvm/test/CodeGen/X86/commute-blend-avx2.ll   |    6 +-
 llvm/test/CodeGen/X86/commute-blend-sse41.ll  |   12 +-
 .../X86/copysign-constant-magnitude.ll        |   24 +-
 llvm/test/CodeGen/X86/extract-concat.ll       |   10 +-
 llvm/test/CodeGen/X86/extractelement-fp.ll    |    7 +-
 llvm/test/CodeGen/X86/extractelement-load.ll  |   59 +-
 llvm/test/CodeGen/X86/fabs.ll                 |    3 +-
 llvm/test/CodeGen/X86/fast-isel-fneg.ll       |    5 +-
 llvm/test/CodeGen/X86/fma-signed-zero.ll      |    6 +-
 llvm/test/CodeGen/X86/fp-fold.ll              |   27 +-
 llvm/test/CodeGen/X86/fp-intrinsics-fma.ll    |   42 +-
 llvm/test/CodeGen/X86/fp-logic.ll             |   12 +-
 llvm/test/CodeGen/X86/fp-round.ll             |   57 +-
 llvm/test/CodeGen/X86/fp128-cast.ll           |    3 +-
 llvm/test/CodeGen/X86/fp16-libcalls.ll        |   12 +-
 llvm/test/CodeGen/X86/freeze-vector.ll        |    8 +-
 llvm/test/CodeGen/X86/gfni-funnel-shifts.ll   |   62 +-
 llvm/test/CodeGen/X86/half.ll                 |   14 +-
 .../X86/insert-into-constant-vector.ll        |   52 +-
 llvm/test/CodeGen/X86/insertps-combine.ll     |   12 +-
 .../CodeGen/X86/insertps-from-constantpool.ll |    6 +-
 .../CodeGen/X86/insertps-unfold-load-bug.ll   |    4 +-
 llvm/test/CodeGen/X86/is_fpclass.ll           |   10 +-
 .../X86/isel-blendi-gettargetconstant.ll      |    7 +-
 llvm/test/CodeGen/X86/load-partial.ll         |    4 -
 llvm/test/CodeGen/X86/masked_load.ll          |    3 +-
 llvm/test/CodeGen/X86/masked_store.ll         |   15 +-
 llvm/test/CodeGen/X86/mmx-arith.ll            |   11 +-
 llvm/test/CodeGen/X86/neg_fp.ll               |    5 +-
 llvm/test/CodeGen/X86/negative-sin.ll         |    3 +-
 llvm/test/CodeGen/X86/packus.ll               |   60 +-
 llvm/test/CodeGen/X86/peephole-fold-movsd.ll  |    2 +-
 llvm/test/CodeGen/X86/pr14161.ll              |    3 +-
 llvm/test/CodeGen/X86/pr30511.ll              |    5 +-
 llvm/test/CodeGen/X86/pr31956.ll              |    9 +-
 llvm/test/CodeGen/X86/pr34592.ll              |   36 +-
 llvm/test/CodeGen/X86/pr36553.ll              |    3 +-
 llvm/test/CodeGen/X86/pr40811.ll              |    9 +-
 llvm/test/CodeGen/X86/pr63091.ll              |    7 +-
 llvm/test/CodeGen/X86/sar_fold64.ll           |   16 +-
 llvm/test/CodeGen/X86/setcc-combine.ll        |    6 +-
 .../test/CodeGen/X86/setcc-non-simple-type.ll |   26 +-
 llvm/test/CodeGen/X86/shrink_vmul.ll          |   28 +-
 llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll |   24 +-
 llvm/test/CodeGen/X86/splat-for-size.ll       |    4 +-
 llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll   |    3 +-
 .../CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll |    6 +-
 llvm/test/CodeGen/X86/sqrt-fastmath.ll        |   10 +-
 .../CodeGen/X86/srem-seteq-vec-nonsplat.ll    |  138 +-
 llvm/test/CodeGen/X86/sse-align-12.ll         |    4 +-
 llvm/test/CodeGen/X86/sse2.ll                 |   85 +-
 llvm/test/CodeGen/X86/sse3.ll                 |   26 +-
 llvm/test/CodeGen/X86/sse41.ll                |  184 +-
 llvm/test/CodeGen/X86/strict-fsub-combines.ll |   28 +-
 llvm/test/CodeGen/X86/subvector-broadcast.ll  |    4 +-
 llvm/test/CodeGen/X86/test-shrink-bug.ll      |    7 +-
 .../X86/tuning-shuffle-unpckpd-avx512.ll      |  189 +-
 .../CodeGen/X86/tuning-shuffle-unpckpd.ll     |   52 +-
 .../X86/urem-seteq-vec-tautological.ll        |    6 +-
 llvm/test/CodeGen/X86/vec_insert-5.ll         |   59 +-
 llvm/test/CodeGen/X86/vec_int_to_fp.ll        |  120 +-
 llvm/test/CodeGen/X86/vec_shift5.ll           |   28 +-
 llvm/test/CodeGen/X86/vector-bitreverse.ll    |    5 +-
 .../vector-constrained-fp-intrinsics-flags.ll |   28 +-
 .../X86/vector-constrained-fp-intrinsics.ll   |   21 +-
 llvm/test/CodeGen/X86/vector-fshl-256.ll      |   44 +-
 llvm/test/CodeGen/X86/vector-fshl-512.ll      |   29 +-
 llvm/test/CodeGen/X86/vector-fshr-256.ll      |    4 +
 llvm/test/CodeGen/X86/vector-fshr-512.ll      |   29 +-
 .../CodeGen/X86/vector-reduce-fmax-nnan.ll    |   28 +-
 llvm/test/CodeGen/X86/vector-reduce-fmin.ll   |    4 +-
 llvm/test/CodeGen/X86/vector-rotate-128.ll    |   30 +-
 llvm/test/CodeGen/X86/vector-rotate-256.ll    |    4 +-
 .../test/CodeGen/X86/vector-shift-ashr-128.ll |   54 +-
 .../test/CodeGen/X86/vector-shift-ashr-256.ll |   31 +-
 .../test/CodeGen/X86/vector-shift-ashr-512.ll |    3 +-
 .../test/CodeGen/X86/vector-shift-lshr-128.ll |   40 +-
 .../test/CodeGen/X86/vector-shift-lshr-256.ll |   31 +-
 .../test/CodeGen/X86/vector-shift-lshr-512.ll |    3 +-
 llvm/test/CodeGen/X86/vector-shift-shl-128.ll |   40 +-
 llvm/test/CodeGen/X86/vector-shift-shl-256.ll |   33 +-
 llvm/test/CodeGen/X86/vector-shift-shl-512.ll |    3 +-
 .../test/CodeGen/X86/vector-shuffle-128-v2.ll |   30 +-
 .../test/CodeGen/X86/vector-shuffle-128-v4.ll |   71 +-
 .../X86/vector-shuffle-combining-avx2.ll      |   19 +-
 .../X86/vector-shuffle-combining-ssse3.ll     |   11 +-
 .../CodeGen/X86/vector-shuffle-combining.ll   |  132 +-
 llvm/test/CodeGen/X86/vector-shuffle-v1.ll    |    6 +-
 llvm/test/CodeGen/X86/vector-shuffle-v192.ll  |   16 +-
 llvm/test/CodeGen/X86/vector-shuffle-v48.ll   |    6 +-
 llvm/test/CodeGen/X86/vselect.ll              |   30 +-
 llvm/test/CodeGen/X86/widened-broadcast.ll    |   62 +-
 .../CodeGen/X86/x86-interleaved-access.ll     |   35 +-
 llvm/test/CodeGen/X86/xop-shifts.ll           |    7 +-
 llvm/test/CodeGen/X86/xor.ll                  |   14 +-
 157 files changed, 4077 insertions(+), 3875 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5140b40166b99e..4807565abc03d7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3479,6 +3479,37 @@ bool TargetLowering::SimplifyDemandedVectorElts(
 
     break;
   }
+  case ISD::LOAD: {
+    auto *Ld = cast<LoadSDNode>(Op);
+    if (!ISD::isNormalLoad(Ld) || !Ld->isSimple())
+      break;
+
+    // TODO: Handle arbitrary vector extract for isMask
+    if (DemandedElts.popcount() != 1)
+      break;
+
+    EVT VT = Ld->getValueType(0);
+    if (TLO.LegalOperations() &&
+        !isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+      break;
+
+    EVT EltVT = VT.getVectorElementType();
+    SDLoc DL(Ld);
+
+    unsigned Idx = DemandedElts.countTrailingZeros();
+
+    SDValue IdxVal = TLO.DAG.getVectorIdxConstant(Idx, DL);
+    SDValue Scalarized =
+        scalarizeExtractedVectorLoad(EltVT, DL, VT, IdxVal, Ld, TLO.DAG);
+    if (!Scalarized)
+      break;
+
+    TLO.DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Scalarized.getValue(1));
+
+    SDValue Insert = TLO.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
+                                     TLO.DAG.getUNDEF(VT), Scalarized, IdxVal);
+    return TLO.CombineTo(Op, Insert);
+  }
   case ISD::VECTOR_SHUFFLE: {
     SDValue LHS = Op.getOperand(0);
     SDValue RHS = Op.getOperand(1);
diff --git a/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
index f5aa4c666a5681..e9a4a83a406838 100644
--- a/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
@@ -30,7 +30,7 @@ define void @test_i64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to i64
     %4 = add i64 %3, %3
@@ -43,7 +43,7 @@ define void @test_i64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to i64
     %4 = add i64 %3, %3
@@ -121,7 +121,7 @@ define void @test_f64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to double
     %4 = fadd double %3, %3
@@ -134,7 +134,7 @@ define void @test_f64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to double
     %4 = fadd double %3, %3
@@ -213,7 +213,7 @@ define void @test_v1i64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to <1 x i64>
     %4 = add <1 x i64> %3, %3
@@ -226,7 +226,7 @@ define void @test_v1i64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to <1 x i64>
     %4 = add <1 x i64> %3, %3
@@ -318,7 +318,7 @@ define void @test_v2f32_v1i64(ptr %p, ptr %q) {
 define void @test_v2f32_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: st1 { v{{[0-9]+}}.2s }
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to <2 x float>
     %4 = fadd <2 x float> %3, %3
@@ -410,7 +410,7 @@ define void @test_v2i32_v1i64(ptr %p, ptr %q) {
 define void @test_v2i32_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: st1 { v{{[0-9]+}}.2s }
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to <2 x i32>
     %4 = add <2 x i32> %3, %3
@@ -488,7 +488,7 @@ define void @test_v4i16_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.4h
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to <4 x i16>
     %4 = add <4 x i16> %3, %3
@@ -501,7 +501,7 @@ define void @test_v4i16_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.4h
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to <4 x i16>
     %4 = add <4 x i16> %3, %3
@@ -587,7 +587,7 @@ define void @test_v4f16_v2f32(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to <4 x half>
     %4 = fadd <4 x half> %3, %3
@@ -602,7 +602,7 @@ define void @test_v4f16_v2i32(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to <4 x half>
     %4 = fadd <4 x half> %3, %3
@@ -682,7 +682,7 @@ define void @test_v8i8_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.8b
 ; CHECK: st1 { v{{[0-9]+}}.8b }
-    %1 = load <2 x float>, ptr %p
+    %1 = load volatile <2 x float>, ptr %p
     %2 = fadd <2 x float> %1, %1
     %3 = bitcast <2 x float> %2 to <8 x i8>
     %4 = add <8 x i8> %3, %3
@@ -695,7 +695,7 @@ define void @test_v8i8_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.8b
 ; CHECK: st1 { v{{[0-9]+}}.8b }
-    %1 = load <2 x i32>, ptr %p
+    %1 = load volatile <2 x i32>, ptr %p
     %2 = add <2 x i32> %1, %1
     %3 = bitcast <2 x i32> %2 to <8 x i8>
     %4 = add <8 x i8> %3, %3
@@ -721,7 +721,7 @@ define void @test_f128_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: ext
 ; CHECK: str
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to fp128
     %4 = fadd fp128 %3, %3
@@ -734,7 +734,7 @@ define void @test_f128_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: ext
 ; CHECK: str
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to fp128
     %4 = fadd fp128 %3, %3
@@ -816,7 +816,7 @@ define void @test_v2f64_f128(ptr %p, ptr %q) {
 define void @test_v2f64_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: st1 { v{{[0-9]+}}.2d }
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to <2 x double>
     %4 = fadd <2 x double> %3, %3
@@ -895,7 +895,7 @@ define void @test_v2i64_f128(ptr %p, ptr %q) {
 define void @test_v2i64_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: st1 { v{{[0-9]+}}.2d }
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to <2 x i64>
     %4 = add <2 x i64> %3, %3
@@ -979,7 +979,7 @@ define void @test_v4f32_v2f64(ptr %p, ptr %q) {
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to <4 x float>
     %4 = fadd <4 x float> %3, %3
@@ -994,7 +994,7 @@ define void @test_v4f32_v2i64(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to <4 x float>
     %4 = fadd <4 x float> %3, %3
@@ -1062,7 +1062,7 @@ define void @test_v4i32_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to <4 x i32>
     %4 = add <4 x i32> %3, %3
@@ -1075,7 +1075,7 @@ define void @test_v4i32_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to <4 x i32>
     %4 = add <4 x i32> %3, %3
@@ -1141,7 +1141,7 @@ define void @test_v8i16_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.8h
 ; CHECK: st1 { v{{[0-9]+}}.8h }
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to <8 x i16>
     %4 = add <8 x i16> %3, %3
@@ -1154,7 +1154,7 @@ define void @test_v8i16_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.8h
 ; CHECK: st1 { v{{[0-9]+}}.8h }
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to <8 x i16>
     %4 = add <8 x i16> %3, %3
@@ -1234,7 +1234,7 @@ define void @test_v16i8_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.16b
 ; CHECK: st1 { v{{[0-9]+}}.16b }
-    %1 = load <2 x double>, ptr %p
+    %1 = load volatile <2 x double>, ptr %p
     %2 = fadd <2 x double> %1, %1
     %3 = bitcast <2 x double> %2 to <16 x i8>
     %4 = add <16 x i8> %3, %3
@@ -1247,7 +1247,7 @@ define void @test_v16i8_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.16b
 ; CHECK: st1 { v{{[0-9]+}}.16b }
-    %1 = load <2 x i64>, ptr %p
+    %1 = load volatile <2 x i64>, ptr %p
     %2 = add <2 x i64> %1, %1
     %3 = bitcast <2 x i64> %2 to <16 x i8>
     %4 = add <16 x i8> %3, %3
@@ -1315,7 +1315,7 @@ define %struct.struct1 @test_v4f16_struct(ptr %ret) {
 entry:
 ; CHECK: ld1 { {{v[0-9]+}}.4h }
 ; CHECK-NOT: rev
-  %0 = load <4 x half>, ptr %ret, align 2
+  %0 = load volatile <4 x half>, ptr %ret, align 2
   %1 = extractelement <4 x half> %0, i32 0
   %.fca.0.insert = insertvalue %struct.struct1 undef, half %1, 0
   ret %struct.struct1 %.fca.0.insert
diff --git a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
index d76e817e62a495..ce657aa1f0b5bc 100644
--- a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
+++ b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
@@ -27,10 +27,7 @@
 define i64 @g(ptr %p) {
 ; CHECK-LABEL: g:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    add x9, x8, x8
-; CHECK-NEXT:    add x8, x9, x8
-; CHECK-NEXT:    sub x0, x8, x8
+; CHECK-NEXT:    mov x0, xzr
 ; CHECK-NEXT:    ret
   %vec = load <2 x i64>, ptr %p, align 1
   %elt = extractelement <2 x i64> %vec, i32 1
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 66f26fc9d85973..d39e537edb7861 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -679,28 +679,27 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 160
 ; CHECK-SD-NEXT:    .cfi_offset w30, -16
 ; CHECK-SD-NEXT:    stp q2, q5, [sp, #112] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    add x8, sp, #176
 ; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
-; CHECK-SD-NEXT:    ldr d5, [sp, #184]
-; CHECK-SD-NEXT:    str q3, [sp, #64] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldp d3, d2, [sp, #168]
+; CHECK-SD-NEXT:    str q3, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp d3, d2, [sp, #160]
 ; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
 ; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    mov v0.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
-; CHECK-SD-NEXT:    str q5, [sp, #96] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldr d5, [sp, #160]
-; CHECK-SD-NEXT:    mov v3.d[1], v2.d[0]
-; CHECK-SD-NEXT:    str q5, [sp, #80] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp q6, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    ld1 { v2.d }[1], [x8]
+; CHECK-SD-NEXT:    stp q6, q3, [sp, #80] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    str q2, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr d2, [sp, #184]
+; CHECK-SD-NEXT:    str q2, [sp, #64] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    bl __lttf2
 ; CHECK-SD-NEXT:    cmp w0, #0
-; CHECK-SD-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    cset w8, lt
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
 ; CHECK-SD-NEXT:    fmov d0, x8
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q0, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    bl __lttf2
 ; CHECK-SD-NEXT:    cmp w0, #0
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -708,19 +707,19 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
 ; CHECK-SD-NEXT:    fmov d1, x8
 ; CHECK-SD-NEXT:    mov v1.d[1], v0.d[0]
-; CHECK-SD-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    ldp q0, q1, [sp, #112] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    bl __lttf2
-; CHECK-SD-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q0, q3, [sp, #80] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    cmp w0, #0
-; CHECK-SD-NEXT:    ldp q2, q4, [sp, #64] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q2, q1, [sp, #32] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    cset w8, lt
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-SD-NEXT:    ldr q3, [sp, #96] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr q4, [sp, #64] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-SD-NEXT:    fmov d2, x8
-; CHECK-SD-NEXT:    bsl v2.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT:    bsl v2.16b, v3.16b, v4.16b
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
@@ -815,20 +814,20 @@ define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double>
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    add x8, sp, #16
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
-; CHECK-SD-NEXT:    ldr d16, [sp, #24]
-; CHECK-SD-NEXT:    ldr d17, [sp]
 ; CHECK-SD-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT:    ldp d1, d4, [sp, #8]
 ; CHECK-SD-NEXT:    fcmgt v2.2d, v5.2d, v2.2d
-; CHECK-SD-NEXT:    mov v1.d[1], v4.d[0]
 ; CHECK-SD-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
-; CHECK-SD-NEXT:    bsl v2.16b, v17.16b, v16.16b
-; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    ldp d3, d1, [sp]
+; CHECK-SD-NEXT:    ld1 { v1.d }[1], [x8]
 ; CHECK-SD-NEXT:    bsl v0.16b, v6.16b, v1.16b
+; CHECK-SD-NEXT:    ldr d1, [sp, #24]
+; CHECK-SD-NEXT:    bsl v2.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
diff --git a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
index 31ead890ba8ac7..ed22243eeef45f 100644
--- a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
+++ b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
@@ -45,11 +45,11 @@ define void @loop(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K,
 ; CHECK-NEXT:    mov w8, w3
 ; CHECK-NEXT:  .LBB1_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q2, [x1], #2
+; CHECK-NEXT:    ldr q2, [x2], #2
 ; CHECK-NEXT:    subs x8, x8, #1
-; CHECK-NEXT:    ldr q3, [x2], #2
-; CHECK-NEXT:    fmlal v0.4s, v3.4h, v2.h[0]
-; CHECK-NEXT:    fmlal2 v1.4s, v3.4h, v2.h[0]
+; CHECK-NEXT:    ld1r { v3.8h }, [x1], #2
+; CHECK-NEXT:    fmlal v0.4s, v2.4h, v3.4h
+; CHECK-NEXT:    fmlal2 v1.4s, v2.4h, v3.4h
 ; CHECK-NEXT:    b.ne .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
 ; CHECK-NEXT:    stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index e284795760c5ca..f586647439d255 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -1123,30 +1123,29 @@ entry:
 define <3 x i64> @v3i64_i64(<3 x i64> %a, <3 x i64> %b, <3 x i64> %d, <3 x i64> %e) {
 ; CHECK-SD-LABEL: v3i64_i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
 ; CHECK-SD-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    add x8, sp, #16
 ; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT:    ldr d16, [sp, #24]
-; CHECK-SD-NEXT:    ldr d17, [sp]
 ; CHECK-SD-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT:    ldp d1, d4, [sp, #8]
-; CHECK-SD-NEXT:    mov v1.d[1], v4.d[0]
+; CHECK-SD-NEXT:    ldp d4, d1, [sp]
+; CHECK-SD-NEXT:    ld1 { v1.d }[1], [x8]
 ; CHECK-SD-NEXT:    cmgt v0.2d, v3.2d, v0.2d
 ; CHECK-SD-NEXT:    bsl v0.16b, v6.16b, v1.16b
 ; CHECK-SD-NEXT:    cmgt v1.2d, v5.2d, v2.2d
-; CHECK-SD-NEXT:    mov v2.16b, v1.16b
+; CHECK-SD-NEXT:    ldr d2, [sp, #24]
+; CHECK-SD-NEXT:    bit v2.16b, v4.16b, v1.16b
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-SD-NEXT:    bsl v2.16b, v17.16b, v16.16b
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: v3i64_i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
index ad4efeaf39247a..1e6427c4cd4956 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
@@ -33,10 +33,7 @@ define half @extractelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
 define half @extractelement_v16f16(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.h, z0.h[15]
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ldr h0, [x0, #30]
 ; CHECK-NEXT:    ret
     %op1 = load <16 x half>, ptr %a
     %r = extractelement <16 x half> %op1, i64 15
@@ -44,22 +41,10 @@ define half @extractelement_v16f16(ptr %a) vscale_range(2,0) #0 {
 }
 
 define half @extractelement_v32f16(ptr %a) #0 {
-; VBITS_GE_256-LABEL: extractelement_v32f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    mov z0.h, z0.h[15]
-; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: extractelement_v32f16:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    mov z0.h, z0.h[31]
-; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: extractelement_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0, #62]
+; CHECK-NEXT:    ret
     %op1 = load <32 x half>, ptr %a
     %r = extractelement <32 x half> %op1, i64 31
     ret half %r
@@ -68,11 +53,7 @@ define half @extractelement_v32f16(ptr %a) #0 {
 define half @extractelement_v64f16(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v64f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    mov w8, #63 // =0x3f
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.h, xzr, x8
-; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    ldr h0, [x0, #126]
 ; CHECK-NEXT:    ret
     %op1 = load <64 x half>, ptr %a
     %r = extractelement <64 x half> %op1, i64 63
@@ -82,11 +63,7 @@ define half @extractelement_v64f16(ptr %a) vscale_range(8,0) #0 {
 define half @extractelement_v128f16(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v128f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    mov w8, #127 // =0x7f
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.h, xzr, x8
-; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    ldr h0, [x0, #254]
 ; CHECK-NEXT:    ret
     %op1 = load <128 x half>, ptr %a
     %r = extractelement <128 x half> %op1, i64 127
@@ -117,10 +94,7 @@ define float @extractelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
 define float @extractelement_v8f32(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.s, z0.s[7]
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ldr s0, [x0, #28]
 ; CHECK-NEXT:    ret
     %op1 = load <8 x float>, ptr %a
     %r = extractelement <8 x float> %op1, i64 7
@@ -128,22 +102,10 @@ define float @extractelement_v8f32(ptr %a) vscale_range(2,0) #0 {
 }
 
 define float @extractelement_v16f32(ptr %a) #0 {
-; VBITS_GE_256-LABEL: extractelement_v16f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov z0.s, z0.s[7]
-; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: extractelement_v16f32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    mov z0.s, z0.s[15]
-; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: extractelement_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0, #60]
+; CHECK-NEXT:    ret
     %op1 = load <16 x float>, ptr %a
     %r = extractelement <16 x float> %op1, i64 15
     ret float %r
@@ -152,11 +114,7 @@ define float @extractelement_v16f32(ptr %a) #0 {
 define float @extractelement_v32f32(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v32f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    mov w8, #31 // =0x1f
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.s, xzr, x8
-; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    ldr s0, [x0, #124]
 ; CHECK-NEXT:    ret
     %op1 = load <32 x float>, ptr %a
     %r = extractelement <32 x float> %op1, i64 31
@@ -166,11 +124,7 @@ define float @extractelement_v32f32(ptr %a) vscale_range(8,0) #0 {
 define float @extractelement_v64f32(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v64f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    mov w8, #63 // =0x3f
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.s, xzr, x8
-; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    ldr s0, [x0, #252]
 ; CHECK-NEXT:    ret
     %op1 = load <64 x float>, ptr %a
     %r = extractelement <64 x float> %op1, i64 63
@@ -199,10 +153,7 @@ define double @extractelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 {
 define double @extractelement_v4f64(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.d, z0.d[3]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ldr d0, [x0, #24]
 ; CHECK-NEXT:    ret
     %op1 = load <4 x double>, ptr %a
     %r = extractelement <4 x double> %op1, i64 3
@@ -210,22 +161,10 @@ define double @extractelement_v4f64(ptr %a) vscale_range(2,0) #0 {
 }
 
 define double @extractelement_v8f64(ptr %a) #0 {
-; VBITS_GE_256-LABEL: extractelement_v8f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov z0.d, z0.d[3]
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: extractelement_v8f64:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    mov z0.d, z0.d[7]
-; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: extractelement_v8f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0, #56]
+; CHECK-NEXT:    ret
     %op1 = load <8 x double>, ptr %a
     %r = extractelement <8 x double> %op1, i64 7
     ret double %r
@@ -234,11 +173,7 @@ define double @extractelement_v8f64(ptr %a) #0 {
 define double @extractelement_v16f64(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v16f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    mov w8, #15 // =0xf
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.d, xzr, x8
-; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    ldr d0, [x0, #120]
 ; CHECK-NEXT:    ret
     %op1 = load <16 x double>, ptr %a
     %r = extractelement <16 x double> %op1, i64 15
@@ -248,11 +183,7 @@ define double @extractelement_v16f64(ptr %a) vscale_range(8,0) #0 {
 define double @extractelement_v32f64(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v32f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    mov w8, #31 // =0x1f
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.d, xzr, x8
-; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    ldr d0, [x0, #248]
 ; CHECK-NEXT:    ret
     %op1 = load <32 x double>, ptr %a
     %r = extractelement <32 x double> %op1, i64 31
@@ -260,3 +191,6 @@ define double @extractelement_v32f64(ptr %a) vscale_range(16,0) #0 {
 }
 
 attributes #0 = { "target-features"="+sve" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; VBITS_GE_256: {{.*}}
+; VBITS_GE_512: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index 27e95489f8ad7a..5233d292c4eaf4 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -438,8 +438,7 @@ define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    // implicit-def: $d0
 ; CHECK-NEXT:    cbnz x8, .LBB15_2
 ; CHECK-NEXT:  // %bb.1: // %cond.load
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index e3e06dcdf17f30..5af3a88c711bd1 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -415,13 +415,13 @@ define void @masked_scatter_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define void @masked_scatter_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    cbnz x8, .LBB15_2
 ; CHECK-NEXT:  // %bb.1: // %cond.store
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    str d0, [x8]
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    str d0, [x9]
 ; CHECK-NEXT:  .LBB15_2: // %else
 ; CHECK-NEXT:    ret
   %vals = load <1 x i64>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
index cf308e6c4395ff..f0e5fa6e03090f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
@@ -71,18 +71,12 @@ define half @extractelement_v8f16(<8 x half> %op1) {
 define half @extractelement_v16f16(ptr %a) {
 ; CHECK-LABEL: extractelement_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z0.h, z0.h[7]
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ldr h0, [x0, #30]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr h0, [x0, #30]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %r = extractelement <16 x half> %op1, i64 15
@@ -131,18 +125,12 @@ define float @extractelement_v4f32(<4 x float> %op1) {
 define float @extractelement_v8f32(ptr %a) {
 ; CHECK-LABEL: extractelement_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z0.s, z0.s[3]
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ldr s0, [x0, #28]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr s0, [x0, #28]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %r = extractelement <8 x float> %op1, i64 7
@@ -182,18 +170,12 @@ define double @extractelement_v2f64(<2 x double> %op1) {
 define double @extractelement_v4f64(ptr %a) {
 ; CHECK-LABEL: extractelement_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ldr d0, [x0, #24]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr d0, [x0, #24]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %r = extractelement <4 x double> %op1, i64 3
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index fab45c9dc3bc3c..6ffe8c5fab29f8 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -880,44 +880,43 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
+; SI-NEXT:    s_load_dword s4, s[4:5], 0xe
+; SI-NEXT:    s_brev_b32 s5, -2
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_bfi_b32 v0, s5, v0, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x38
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_brev_b32 s4, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_brev_b32 s0, -2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_bfi_b32 v2, s0, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_bfi_b32 v2, s4, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x38
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x2c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s3, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %sign.trunc = fptrunc double %sign to float
   %result = call float @llvm.copysign.f32(float %mag, float %sign.trunc)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 5f75a2f29a026f..ad126bb22b5831 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -13,14 +13,14 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x1d
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x1e
+; SI-NEXT:    s_brev_b32 s5, -2
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_bfi_b32 v1, s4, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_bfi_b32 v1, s5, v0, v1
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -28,32 +28,32 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
 ; VI-LABEL: s_test_copysign_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x74
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x78
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_brev_b32 s4, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_brev_b32 s2, -2
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_bfi_b32 v1, s2, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_bfi_b32 v1, s4, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x74
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x4c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x78
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
+; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s3, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   store double %result, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
index 6c921441c972d3..ea2fe9d6208938 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
@@ -19,31 +19,30 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; FORWARDXNACK-LABEL: shuffle_v4f16_234u:
 ; FORWARDXNACK:       ; %bb.0:
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FORWARDXNACK-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; FORWARDXNACK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; FORWARDXNACK-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; FORWARDXNACK-NEXT:    global_load_dword v5, v[2:3], off
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(1)
-; FORWARDXNACK-NEXT:    v_mov_b32_e32 v0, v6
+; FORWARDXNACK-NEXT:    v_mov_b32_e32 v0, v4
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(0)
-; FORWARDXNACK-NEXT:    v_mov_b32_e32 v1, v4
+; FORWARDXNACK-NEXT:    v_mov_b32_e32 v1, v5
 ; FORWARDXNACK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; REVERSEXNACK-LABEL: shuffle_v4f16_234u:
 ; REVERSEXNACK:       ; %bb.0:
 ; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v6, v1
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v5, v0
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v4, v3
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v3, v2
-; REVERSEXNACK-NEXT:    global_load_dword v0, v[5:6], off offset:4
-; REVERSEXNACK-NEXT:    global_load_dwordx2 v[1:2], v[3:4], off
+; REVERSEXNACK-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; REVERSEXNACK-NEXT:    global_load_dword v4, v[2:3], off
+; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(1)
+; REVERSEXNACK-NEXT:    v_mov_b32_e32 v0, v5
 ; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(0)
+; REVERSEXNACK-NEXT:    v_mov_b32_e32 v1, v4
 ; REVERSEXNACK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; NOXNACK-LABEL: shuffle_v4f16_234u:
 ; NOXNACK:       ; %bb.0:
 ; NOXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOXNACK-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; NOXNACK-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; NOXNACK-NEXT:    global_load_dword v1, v[2:3], off
 ; NOXNACK-NEXT:    s_waitcnt vmcnt(0)
 ; NOXNACK-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 5dff660912e402..a656ce2fa9d71f 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -34,7 +34,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; CHECK-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
+; CHECK-NEXT:    s_load_dword s22, s[4:5], 0x0
 ; CHECK-NEXT:    s_movk_i32 s20, 0x130
 ; CHECK-NEXT:    s_mov_b32 s21, s24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -55,7 +55,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b32 s20, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_writelane_b32 v7, s49, 13
-; CHECK-NEXT:    v_mov_b32_e32 v2, s28
+; CHECK-NEXT:    v_mov_b32_e32 v2, s22
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v1
 ; CHECK-NEXT:    s_mov_b32 s21, s20
 ; CHECK-NEXT:    s_mov_b32 s22, s20
@@ -318,8 +318,8 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
 ; CHECK-NEXT:    ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
 ; CHECK-NEXT:    ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
+; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 3d27b5fe7f30b3..5b96fb06afbbf4 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -112,10 +112,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4-LABEL: llvm_amdgcn_is_shared:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x40
-; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s1, s[6:7], 0x40
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V4-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
@@ -124,10 +124,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_is_shared:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0xcc
-; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0xcc
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V5-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v0
@@ -166,10 +166,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4-LABEL: llvm_amdgcn_is_private:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x44
-; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s1, s[6:7], 0x44
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V4-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
@@ -178,10 +178,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_is_private:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0xc8
-; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0xc8
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V5-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v0
diff --git a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
index 363d568f9c11c9..f06175d1adaec3 100644
--- a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
@@ -16,7 +16,7 @@ define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
 .entry:
   %tmp31 = sext i32 %arg18 to i64
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) @indexable, i64 0, i64 %tmp31
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -31,7 +31,7 @@ define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
   %tmp1 = zext i32 %arg18 to i64
   %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -46,7 +46,7 @@ define amdgpu_ps float @const_nonuniform(i32 %arg18) {
   %tmp1 = zext i32 %arg18 to i64
   %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 1
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -61,7 +61,7 @@ define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
   %tmp1 = zext i32 %arg18 to i64
   %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
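
Note on the pattern above: these four tests now use volatile loads so the
<3 x float> load stays at full width. The DAG combiner may not narrow or
split a volatile access, so demanding only element 0 no longer shrinks it
to a scalar load. A minimal sketch of the idea (hypothetical function, not
part of the patch):

define amdgpu_ps float @keep_wide_load(ptr addrspace(1) %p) {
  ; Without volatile, only element 0 is demanded and this vector load
  ; could be narrowed to a single 4-byte scalar load.
  %v = load volatile <3 x float>, ptr addrspace(1) %p, align 16
  %elt = extractelement <3 x float> %v, i32 0
  ret float %elt
}
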
diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index db802732e987b5..87a21a46eaff51 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -93,8 +93,8 @@ define amdgpu_kernel void @sgpr_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %a) {
 }
 
 ; GCN-LABEL: {{^}}s_trunc_i64_to_i1:
-; SI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
-; VI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
+; SI: s_load_dword s[[SLO:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dword s[[SLO:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: s_bitcmp1_b32 s[[SLO]], 0
 ; GCN: s_cselect_b32 {{s[0-9]+}}, 63, -12
 define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i64 %x) {
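
The updated checks expect a narrower load: the trunc to i1 demands only
bit 0 of %x, so the 64-bit kernel-argument load shrinks from
s_load_dwordx2 to a single s_load_dword. A sketch of the shape of the
test, reconstructed from the checks (hypothetical name and body):

define amdgpu_kernel void @trunc_demands_low_bit(ptr addrspace(1) %out, i64 %x) {
  ; Only bit 0 of %x is used, so codegen needs just its low dword.
  %b = trunc i64 %x to i1
  %r = select i1 %b, i32 63, i32 -12
  store i32 %r, ptr addrspace(1) %out
  ret void
}
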
diff --git a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
index b079a94b5fcc3d..fc0740ab2693e9 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
@@ -5,31 +6,31 @@
 define <2 x i8> @shuffle_v2i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_ushort v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_ushort v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b16 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2i8_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b16 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b16 v0, 8, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x i8> %val0, <2 x i8> poison, <2 x i32> <i32 1, i32 1>
@@ -39,37 +40,37 @@ entry:
 define <4 x i8> @shuffle_v4i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i8_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x i8> %val0, <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -79,49 +80,49 @@ entry:
 define <8 x i8> @shuffle_v8i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8i8_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x i8> %val0, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -131,73 +132,73 @@ entry:
 define <16 x i8> @shuffle_v16i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16i8_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x i8> %val0, <16 x i8> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -207,121 +208,121 @@ entry:
 define <32 x i8> @shuffle_v32i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  v_mov_b32_e32 v16, v0
-; GFX9-NEXT:  v_mov_b32_e32 v17, v0
-; GFX9-NEXT:  v_mov_b32_e32 v18, v0
-; GFX9-NEXT:  v_mov_b32_e32 v19, v0
-; GFX9-NEXT:  v_mov_b32_e32 v20, v0
-; GFX9-NEXT:  v_mov_b32_e32 v21, v0
-; GFX9-NEXT:  v_mov_b32_e32 v22, v0
-; GFX9-NEXT:  v_mov_b32_e32 v23, v0
-; GFX9-NEXT:  v_mov_b32_e32 v24, v0
-; GFX9-NEXT:  v_mov_b32_e32 v25, v0
-; GFX9-NEXT:  v_mov_b32_e32 v26, v0
-; GFX9-NEXT:  v_mov_b32_e32 v27, v0
-; GFX9-NEXT:  v_mov_b32_e32 v28, v0
-; GFX9-NEXT:  v_mov_b32_e32 v29, v0
-; GFX9-NEXT:  v_mov_b32_e32 v30, v0
-; GFX9-NEXT:  v_mov_b32_e32 v31, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v17, v0
+; GFX9-NEXT:    v_mov_b32_e32 v18, v0
+; GFX9-NEXT:    v_mov_b32_e32 v19, v0
+; GFX9-NEXT:    v_mov_b32_e32 v20, v0
+; GFX9-NEXT:    v_mov_b32_e32 v21, v0
+; GFX9-NEXT:    v_mov_b32_e32 v22, v0
+; GFX9-NEXT:    v_mov_b32_e32 v23, v0
+; GFX9-NEXT:    v_mov_b32_e32 v24, v0
+; GFX9-NEXT:    v_mov_b32_e32 v25, v0
+; GFX9-NEXT:    v_mov_b32_e32 v26, v0
+; GFX9-NEXT:    v_mov_b32_e32 v27, v0
+; GFX9-NEXT:    v_mov_b32_e32 v28, v0
+; GFX9-NEXT:    v_mov_b32_e32 v29, v0
+; GFX9-NEXT:    v_mov_b32_e32 v30, v0
+; GFX9-NEXT:    v_mov_b32_e32 v31, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  v_mov_b32_e32 v16, v0
-; GFX10-NEXT:  v_mov_b32_e32 v17, v0
-; GFX10-NEXT:  v_mov_b32_e32 v18, v0
-; GFX10-NEXT:  v_mov_b32_e32 v19, v0
-; GFX10-NEXT:  v_mov_b32_e32 v20, v0
-; GFX10-NEXT:  v_mov_b32_e32 v21, v0
-; GFX10-NEXT:  v_mov_b32_e32 v22, v0
-; GFX10-NEXT:  v_mov_b32_e32 v23, v0
-; GFX10-NEXT:  v_mov_b32_e32 v24, v0
-; GFX10-NEXT:  v_mov_b32_e32 v25, v0
-; GFX10-NEXT:  v_mov_b32_e32 v26, v0
-; GFX10-NEXT:  v_mov_b32_e32 v27, v0
-; GFX10-NEXT:  v_mov_b32_e32 v28, v0
-; GFX10-NEXT:  v_mov_b32_e32 v29, v0
-; GFX10-NEXT:  v_mov_b32_e32 v30, v0
-; GFX10-NEXT:  v_mov_b32_e32 v31, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    v_mov_b32_e32 v16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v17, v0
+; GFX10-NEXT:    v_mov_b32_e32 v18, v0
+; GFX10-NEXT:    v_mov_b32_e32 v19, v0
+; GFX10-NEXT:    v_mov_b32_e32 v20, v0
+; GFX10-NEXT:    v_mov_b32_e32 v21, v0
+; GFX10-NEXT:    v_mov_b32_e32 v22, v0
+; GFX10-NEXT:    v_mov_b32_e32 v23, v0
+; GFX10-NEXT:    v_mov_b32_e32 v24, v0
+; GFX10-NEXT:    v_mov_b32_e32 v25, v0
+; GFX10-NEXT:    v_mov_b32_e32 v26, v0
+; GFX10-NEXT:    v_mov_b32_e32 v27, v0
+; GFX10-NEXT:    v_mov_b32_e32 v28, v0
+; GFX10-NEXT:    v_mov_b32_e32 v29, v0
+; GFX10-NEXT:    v_mov_b32_e32 v30, v0
+; GFX10-NEXT:    v_mov_b32_e32 v31, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32i8_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  v_mov_b32_e32 v16, v0
-; GFX11-NEXT:  v_mov_b32_e32 v17, v0
-; GFX11-NEXT:  v_mov_b32_e32 v18, v0
-; GFX11-NEXT:  v_mov_b32_e32 v19, v0
-; GFX11-NEXT:  v_mov_b32_e32 v20, v0
-; GFX11-NEXT:  v_mov_b32_e32 v21, v0
-; GFX11-NEXT:  v_mov_b32_e32 v22, v0
-; GFX11-NEXT:  v_mov_b32_e32 v23, v0
-; GFX11-NEXT:  v_mov_b32_e32 v24, v0
-; GFX11-NEXT:  v_mov_b32_e32 v25, v0
-; GFX11-NEXT:  v_mov_b32_e32 v26, v0
-; GFX11-NEXT:  v_mov_b32_e32 v27, v0
-; GFX11-NEXT:  v_mov_b32_e32 v28, v0
-; GFX11-NEXT:  v_mov_b32_e32 v29, v0
-; GFX11-NEXT:  v_mov_b32_e32 v30, v0
-; GFX11-NEXT:  v_mov_b32_e32 v31, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    v_mov_b32_e32 v16, v0
+; GFX11-NEXT:    v_mov_b32_e32 v17, v0
+; GFX11-NEXT:    v_mov_b32_e32 v18, v0
+; GFX11-NEXT:    v_mov_b32_e32 v19, v0
+; GFX11-NEXT:    v_mov_b32_e32 v20, v0
+; GFX11-NEXT:    v_mov_b32_e32 v21, v0
+; GFX11-NEXT:    v_mov_b32_e32 v22, v0
+; GFX11-NEXT:    v_mov_b32_e32 v23, v0
+; GFX11-NEXT:    v_mov_b32_e32 v24, v0
+; GFX11-NEXT:    v_mov_b32_e32 v25, v0
+; GFX11-NEXT:    v_mov_b32_e32 v26, v0
+; GFX11-NEXT:    v_mov_b32_e32 v27, v0
+; GFX11-NEXT:    v_mov_b32_e32 v28, v0
+; GFX11-NEXT:    v_mov_b32_e32 v29, v0
+; GFX11-NEXT:    v_mov_b32_e32 v30, v0
+; GFX11-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x i8> %val0, <32 x i8> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -331,28 +332,28 @@ entry:
 define <2 x i16> @shuffle_v2i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2i16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x i16> %val0, <2 x i16> poison, <2 x i32> <i32 1, i32 1>
@@ -362,32 +363,32 @@ entry:
 define <4 x i16> @shuffle_v4i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x i16> %val0, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -397,38 +398,38 @@ entry:
 define <8 x i16> @shuffle_v8i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8i16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x i16> %val0, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -438,50 +439,50 @@ entry:
 define <16 x i16> @shuffle_v16i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16i16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x i16> %val0, <16 x i16> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -491,74 +492,74 @@ entry:
 define <32 x i16> @shuffle_v32i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32i16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x i16> %val0, <32 x i16> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -568,27 +569,27 @@ entry:
 define <2 x i32> @shuffle_v2i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x i32> %val0, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
@@ -598,33 +599,33 @@ entry:
 define <4 x i32> @shuffle_v4i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -634,45 +635,45 @@ entry:
 define <8 x i32> @shuffle_v8i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x i32> %val0, <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -682,69 +683,69 @@ entry:
 define <16 x i32> @shuffle_v16i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x i32> %val0, <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -754,117 +755,117 @@ entry:
 define <32 x i32> @shuffle_v32i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  v_mov_b32_e32 v16, v0
-; GFX9-NEXT:  v_mov_b32_e32 v17, v0
-; GFX9-NEXT:  v_mov_b32_e32 v18, v0
-; GFX9-NEXT:  v_mov_b32_e32 v19, v0
-; GFX9-NEXT:  v_mov_b32_e32 v20, v0
-; GFX9-NEXT:  v_mov_b32_e32 v21, v0
-; GFX9-NEXT:  v_mov_b32_e32 v22, v0
-; GFX9-NEXT:  v_mov_b32_e32 v23, v0
-; GFX9-NEXT:  v_mov_b32_e32 v24, v0
-; GFX9-NEXT:  v_mov_b32_e32 v25, v0
-; GFX9-NEXT:  v_mov_b32_e32 v26, v0
-; GFX9-NEXT:  v_mov_b32_e32 v27, v0
-; GFX9-NEXT:  v_mov_b32_e32 v28, v0
-; GFX9-NEXT:  v_mov_b32_e32 v29, v0
-; GFX9-NEXT:  v_mov_b32_e32 v30, v0
-; GFX9-NEXT:  v_mov_b32_e32 v31, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v17, v0
+; GFX9-NEXT:    v_mov_b32_e32 v18, v0
+; GFX9-NEXT:    v_mov_b32_e32 v19, v0
+; GFX9-NEXT:    v_mov_b32_e32 v20, v0
+; GFX9-NEXT:    v_mov_b32_e32 v21, v0
+; GFX9-NEXT:    v_mov_b32_e32 v22, v0
+; GFX9-NEXT:    v_mov_b32_e32 v23, v0
+; GFX9-NEXT:    v_mov_b32_e32 v24, v0
+; GFX9-NEXT:    v_mov_b32_e32 v25, v0
+; GFX9-NEXT:    v_mov_b32_e32 v26, v0
+; GFX9-NEXT:    v_mov_b32_e32 v27, v0
+; GFX9-NEXT:    v_mov_b32_e32 v28, v0
+; GFX9-NEXT:    v_mov_b32_e32 v29, v0
+; GFX9-NEXT:    v_mov_b32_e32 v30, v0
+; GFX9-NEXT:    v_mov_b32_e32 v31, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  v_mov_b32_e32 v16, v0
-; GFX10-NEXT:  v_mov_b32_e32 v17, v0
-; GFX10-NEXT:  v_mov_b32_e32 v18, v0
-; GFX10-NEXT:  v_mov_b32_e32 v19, v0
-; GFX10-NEXT:  v_mov_b32_e32 v20, v0
-; GFX10-NEXT:  v_mov_b32_e32 v21, v0
-; GFX10-NEXT:  v_mov_b32_e32 v22, v0
-; GFX10-NEXT:  v_mov_b32_e32 v23, v0
-; GFX10-NEXT:  v_mov_b32_e32 v24, v0
-; GFX10-NEXT:  v_mov_b32_e32 v25, v0
-; GFX10-NEXT:  v_mov_b32_e32 v26, v0
-; GFX10-NEXT:  v_mov_b32_e32 v27, v0
-; GFX10-NEXT:  v_mov_b32_e32 v28, v0
-; GFX10-NEXT:  v_mov_b32_e32 v29, v0
-; GFX10-NEXT:  v_mov_b32_e32 v30, v0
-; GFX10-NEXT:  v_mov_b32_e32 v31, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    v_mov_b32_e32 v16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v17, v0
+; GFX10-NEXT:    v_mov_b32_e32 v18, v0
+; GFX10-NEXT:    v_mov_b32_e32 v19, v0
+; GFX10-NEXT:    v_mov_b32_e32 v20, v0
+; GFX10-NEXT:    v_mov_b32_e32 v21, v0
+; GFX10-NEXT:    v_mov_b32_e32 v22, v0
+; GFX10-NEXT:    v_mov_b32_e32 v23, v0
+; GFX10-NEXT:    v_mov_b32_e32 v24, v0
+; GFX10-NEXT:    v_mov_b32_e32 v25, v0
+; GFX10-NEXT:    v_mov_b32_e32 v26, v0
+; GFX10-NEXT:    v_mov_b32_e32 v27, v0
+; GFX10-NEXT:    v_mov_b32_e32 v28, v0
+; GFX10-NEXT:    v_mov_b32_e32 v29, v0
+; GFX10-NEXT:    v_mov_b32_e32 v30, v0
+; GFX10-NEXT:    v_mov_b32_e32 v31, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  v_mov_b32_e32 v16, v0
-; GFX11-NEXT:  v_mov_b32_e32 v17, v0
-; GFX11-NEXT:  v_mov_b32_e32 v18, v0
-; GFX11-NEXT:  v_mov_b32_e32 v19, v0
-; GFX11-NEXT:  v_mov_b32_e32 v20, v0
-; GFX11-NEXT:  v_mov_b32_e32 v21, v0
-; GFX11-NEXT:  v_mov_b32_e32 v22, v0
-; GFX11-NEXT:  v_mov_b32_e32 v23, v0
-; GFX11-NEXT:  v_mov_b32_e32 v24, v0
-; GFX11-NEXT:  v_mov_b32_e32 v25, v0
-; GFX11-NEXT:  v_mov_b32_e32 v26, v0
-; GFX11-NEXT:  v_mov_b32_e32 v27, v0
-; GFX11-NEXT:  v_mov_b32_e32 v28, v0
-; GFX11-NEXT:  v_mov_b32_e32 v29, v0
-; GFX11-NEXT:  v_mov_b32_e32 v30, v0
-; GFX11-NEXT:  v_mov_b32_e32 v31, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    v_mov_b32_e32 v16, v0
+; GFX11-NEXT:    v_mov_b32_e32 v17, v0
+; GFX11-NEXT:    v_mov_b32_e32 v18, v0
+; GFX11-NEXT:    v_mov_b32_e32 v19, v0
+; GFX11-NEXT:    v_mov_b32_e32 v20, v0
+; GFX11-NEXT:    v_mov_b32_e32 v21, v0
+; GFX11-NEXT:    v_mov_b32_e32 v22, v0
+; GFX11-NEXT:    v_mov_b32_e32 v23, v0
+; GFX11-NEXT:    v_mov_b32_e32 v24, v0
+; GFX11-NEXT:    v_mov_b32_e32 v25, v0
+; GFX11-NEXT:    v_mov_b32_e32 v26, v0
+; GFX11-NEXT:    v_mov_b32_e32 v27, v0
+; GFX11-NEXT:    v_mov_b32_e32 v28, v0
+; GFX11-NEXT:    v_mov_b32_e32 v29, v0
+; GFX11-NEXT:    v_mov_b32_e32 v30, v0
+; GFX11-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x i32> %val0, <32 x i32> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -874,28 +875,28 @@ entry:
 define <2 x bfloat> @shuffle_v2bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x bfloat> %val0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
@@ -905,31 +906,31 @@ entry:
 define <3 x bfloat> @shuffle_v3bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v3bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v1, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v1, v1, s4
-; GFX9-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v1, v1, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v1, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX10-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v3bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v1, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX11-NEXT:  v_alignbit_b32 v1, s0, v1, 16
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX11-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <3 x bfloat> %val0, <3 x bfloat> poison, <3 x i32> <i32 1, i32 1, i32 1>
@@ -939,32 +940,32 @@ entry:
 define <4 x bfloat> @shuffle_v4bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x bfloat> %val0, <4 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -974,35 +975,35 @@ entry:
 define <6 x bfloat> @shuffle_v6bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v6bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v6bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v6bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <6 x bfloat> %val0, <6 x bfloat> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1012,38 +1013,38 @@ entry:
 define <8 x bfloat> @shuffle_v8bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x bfloat> %val0, <8 x bfloat> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1053,50 +1054,50 @@ entry:
 define <16 x bfloat> @shuffle_v16bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x bfloat> %val0, <16 x bfloat> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1106,74 +1107,74 @@ entry:
 define <32 x bfloat> @shuffle_v32bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32bf16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x bfloat> %val0, <32 x bfloat> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1183,28 +1184,28 @@ entry:
 define <2 x half> @shuffle_v2f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x half> %val0, <2 x half> poison, <2 x i32> <i32 1, i32 1>
@@ -1214,31 +1215,31 @@ entry:
 define <3 x half> @shuffle_v3f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v3f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v1, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v1, v1, s4
-; GFX9-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v1, v1, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v1, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX10-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v3f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v1, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX11-NEXT:  v_alignbit_b32 v1, s0, v1, 16
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX11-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <3 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <3 x half> %val0, <3 x half> poison, <3 x i32> <i32 1, i32 1, i32 1>
@@ -1248,32 +1249,32 @@ entry:
 define <4 x half> @shuffle_v4f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -1283,35 +1284,35 @@ entry:
 define <6 x half> @shuffle_v6f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v6f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v6f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v6f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <6 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <6 x half> %val0, <6 x half> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1321,38 +1322,38 @@ entry:
 define <8 x half> @shuffle_v8f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x half> %val0, <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1362,50 +1363,50 @@ entry:
 define <16 x half> @shuffle_v16f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x half> %val0, <16 x half> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1415,74 +1416,74 @@ entry:
 define <32 x half> @shuffle_v32f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32f16_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x half> %val0, <32 x half> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1492,27 +1493,27 @@ entry:
 define <2 x float> @shuffle_v2f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x float> %val0, <2 x float> poison, <2 x i32> <i32 1, i32 1>
@@ -1522,30 +1523,30 @@ entry:
 define <3 x float> @shuffle_v3f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v3f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx3 v[0:2], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v3f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b96 v[0:2], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <3 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <3 x float> %val0, <3 x float> poison, <3 x i32> <i32 1, i32 1, i32 1>
@@ -1555,33 +1556,33 @@ entry:
 define <4 x float> @shuffle_v4f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x float> %val0, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -1591,39 +1592,39 @@ entry:
 define <6 x float> @shuffle_v6f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <6 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <6 x float> %val0, <6 x float> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1633,45 +1634,45 @@ entry:
 define <8 x float> @shuffle_v8f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  v_mov_b32_e32 v6, v1
-; GFX9-NEXT:  v_mov_b32_e32 v7, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  v_mov_b32_e32 v6, v1
-; GFX10-NEXT:  v_mov_b32_e32 v7, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  v_mov_b32_e32 v6, v1
-; GFX11-NEXT:  v_mov_b32_e32 v7, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_mov_b32_e32 v7, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x float> %val0, <8 x float> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1681,69 +1682,69 @@ entry:
 define <16 x float> @shuffle_v16f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  v_mov_b32_e32 v6, v1
-; GFX9-NEXT:  v_mov_b32_e32 v7, v1
-; GFX9-NEXT:  v_mov_b32_e32 v8, v1
-; GFX9-NEXT:  v_mov_b32_e32 v9, v1
-; GFX9-NEXT:  v_mov_b32_e32 v10, v1
-; GFX9-NEXT:  v_mov_b32_e32 v11, v1
-; GFX9-NEXT:  v_mov_b32_e32 v12, v1
-; GFX9-NEXT:  v_mov_b32_e32 v13, v1
-; GFX9-NEXT:  v_mov_b32_e32 v14, v1
-; GFX9-NEXT:  v_mov_b32_e32 v15, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v9, v1
+; GFX9-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-NEXT:    v_mov_b32_e32 v12, v1
+; GFX9-NEXT:    v_mov_b32_e32 v13, v1
+; GFX9-NEXT:    v_mov_b32_e32 v14, v1
+; GFX9-NEXT:    v_mov_b32_e32 v15, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  v_mov_b32_e32 v6, v1
-; GFX10-NEXT:  v_mov_b32_e32 v7, v1
-; GFX10-NEXT:  v_mov_b32_e32 v8, v1
-; GFX10-NEXT:  v_mov_b32_e32 v9, v1
-; GFX10-NEXT:  v_mov_b32_e32 v10, v1
-; GFX10-NEXT:  v_mov_b32_e32 v11, v1
-; GFX10-NEXT:  v_mov_b32_e32 v12, v1
-; GFX10-NEXT:  v_mov_b32_e32 v13, v1
-; GFX10-NEXT:  v_mov_b32_e32 v14, v1
-; GFX10-NEXT:  v_mov_b32_e32 v15, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v1
+; GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_mov_b32_e32 v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v11, v1
+; GFX10-NEXT:    v_mov_b32_e32 v12, v1
+; GFX10-NEXT:    v_mov_b32_e32 v13, v1
+; GFX10-NEXT:    v_mov_b32_e32 v14, v1
+; GFX10-NEXT:    v_mov_b32_e32 v15, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  v_mov_b32_e32 v6, v1
-; GFX11-NEXT:  v_mov_b32_e32 v7, v1
-; GFX11-NEXT:  v_mov_b32_e32 v8, v1
-; GFX11-NEXT:  v_mov_b32_e32 v9, v1
-; GFX11-NEXT:  v_mov_b32_e32 v10, v1
-; GFX11-NEXT:  v_mov_b32_e32 v11, v1
-; GFX11-NEXT:  v_mov_b32_e32 v12, v1
-; GFX11-NEXT:  v_mov_b32_e32 v13, v1
-; GFX11-NEXT:  v_mov_b32_e32 v14, v1
-; GFX11-NEXT:  v_mov_b32_e32 v15, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_mov_b32_e32 v7, v1
+; GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GFX11-NEXT:    v_mov_b32_e32 v9, v1
+; GFX11-NEXT:    v_mov_b32_e32 v10, v1
+; GFX11-NEXT:    v_mov_b32_e32 v11, v1
+; GFX11-NEXT:    v_mov_b32_e32 v12, v1
+; GFX11-NEXT:    v_mov_b32_e32 v13, v1
+; GFX11-NEXT:    v_mov_b32_e32 v14, v1
+; GFX11-NEXT:    v_mov_b32_e32 v15, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x float> %val0, <16 x float> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1753,117 +1754,117 @@ entry:
 define <32 x float> @shuffle_v32f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  v_mov_b32_e32 v6, v1
-; GFX9-NEXT:  v_mov_b32_e32 v7, v1
-; GFX9-NEXT:  v_mov_b32_e32 v8, v1
-; GFX9-NEXT:  v_mov_b32_e32 v9, v1
-; GFX9-NEXT:  v_mov_b32_e32 v10, v1
-; GFX9-NEXT:  v_mov_b32_e32 v11, v1
-; GFX9-NEXT:  v_mov_b32_e32 v12, v1
-; GFX9-NEXT:  v_mov_b32_e32 v13, v1
-; GFX9-NEXT:  v_mov_b32_e32 v14, v1
-; GFX9-NEXT:  v_mov_b32_e32 v15, v1
-; GFX9-NEXT:  v_mov_b32_e32 v16, v1
-; GFX9-NEXT:  v_mov_b32_e32 v17, v1
-; GFX9-NEXT:  v_mov_b32_e32 v18, v1
-; GFX9-NEXT:  v_mov_b32_e32 v19, v1
-; GFX9-NEXT:  v_mov_b32_e32 v20, v1
-; GFX9-NEXT:  v_mov_b32_e32 v21, v1
-; GFX9-NEXT:  v_mov_b32_e32 v22, v1
-; GFX9-NEXT:  v_mov_b32_e32 v23, v1
-; GFX9-NEXT:  v_mov_b32_e32 v24, v1
-; GFX9-NEXT:  v_mov_b32_e32 v25, v1
-; GFX9-NEXT:  v_mov_b32_e32 v26, v1
-; GFX9-NEXT:  v_mov_b32_e32 v27, v1
-; GFX9-NEXT:  v_mov_b32_e32 v28, v1
-; GFX9-NEXT:  v_mov_b32_e32 v29, v1
-; GFX9-NEXT:  v_mov_b32_e32 v30, v1
-; GFX9-NEXT:  v_mov_b32_e32 v31, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v9, v1
+; GFX9-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-NEXT:    v_mov_b32_e32 v12, v1
+; GFX9-NEXT:    v_mov_b32_e32 v13, v1
+; GFX9-NEXT:    v_mov_b32_e32 v14, v1
+; GFX9-NEXT:    v_mov_b32_e32 v15, v1
+; GFX9-NEXT:    v_mov_b32_e32 v16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v17, v1
+; GFX9-NEXT:    v_mov_b32_e32 v18, v1
+; GFX9-NEXT:    v_mov_b32_e32 v19, v1
+; GFX9-NEXT:    v_mov_b32_e32 v20, v1
+; GFX9-NEXT:    v_mov_b32_e32 v21, v1
+; GFX9-NEXT:    v_mov_b32_e32 v22, v1
+; GFX9-NEXT:    v_mov_b32_e32 v23, v1
+; GFX9-NEXT:    v_mov_b32_e32 v24, v1
+; GFX9-NEXT:    v_mov_b32_e32 v25, v1
+; GFX9-NEXT:    v_mov_b32_e32 v26, v1
+; GFX9-NEXT:    v_mov_b32_e32 v27, v1
+; GFX9-NEXT:    v_mov_b32_e32 v28, v1
+; GFX9-NEXT:    v_mov_b32_e32 v29, v1
+; GFX9-NEXT:    v_mov_b32_e32 v30, v1
+; GFX9-NEXT:    v_mov_b32_e32 v31, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  v_mov_b32_e32 v6, v1
-; GFX10-NEXT:  v_mov_b32_e32 v7, v1
-; GFX10-NEXT:  v_mov_b32_e32 v8, v1
-; GFX10-NEXT:  v_mov_b32_e32 v9, v1
-; GFX10-NEXT:  v_mov_b32_e32 v10, v1
-; GFX10-NEXT:  v_mov_b32_e32 v11, v1
-; GFX10-NEXT:  v_mov_b32_e32 v12, v1
-; GFX10-NEXT:  v_mov_b32_e32 v13, v1
-; GFX10-NEXT:  v_mov_b32_e32 v14, v1
-; GFX10-NEXT:  v_mov_b32_e32 v15, v1
-; GFX10-NEXT:  v_mov_b32_e32 v16, v1
-; GFX10-NEXT:  v_mov_b32_e32 v17, v1
-; GFX10-NEXT:  v_mov_b32_e32 v18, v1
-; GFX10-NEXT:  v_mov_b32_e32 v19, v1
-; GFX10-NEXT:  v_mov_b32_e32 v20, v1
-; GFX10-NEXT:  v_mov_b32_e32 v21, v1
-; GFX10-NEXT:  v_mov_b32_e32 v22, v1
-; GFX10-NEXT:  v_mov_b32_e32 v23, v1
-; GFX10-NEXT:  v_mov_b32_e32 v24, v1
-; GFX10-NEXT:  v_mov_b32_e32 v25, v1
-; GFX10-NEXT:  v_mov_b32_e32 v26, v1
-; GFX10-NEXT:  v_mov_b32_e32 v27, v1
-; GFX10-NEXT:  v_mov_b32_e32 v28, v1
-; GFX10-NEXT:  v_mov_b32_e32 v29, v1
-; GFX10-NEXT:  v_mov_b32_e32 v30, v1
-; GFX10-NEXT:  v_mov_b32_e32 v31, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v1
+; GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_mov_b32_e32 v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v11, v1
+; GFX10-NEXT:    v_mov_b32_e32 v12, v1
+; GFX10-NEXT:    v_mov_b32_e32 v13, v1
+; GFX10-NEXT:    v_mov_b32_e32 v14, v1
+; GFX10-NEXT:    v_mov_b32_e32 v15, v1
+; GFX10-NEXT:    v_mov_b32_e32 v16, v1
+; GFX10-NEXT:    v_mov_b32_e32 v17, v1
+; GFX10-NEXT:    v_mov_b32_e32 v18, v1
+; GFX10-NEXT:    v_mov_b32_e32 v19, v1
+; GFX10-NEXT:    v_mov_b32_e32 v20, v1
+; GFX10-NEXT:    v_mov_b32_e32 v21, v1
+; GFX10-NEXT:    v_mov_b32_e32 v22, v1
+; GFX10-NEXT:    v_mov_b32_e32 v23, v1
+; GFX10-NEXT:    v_mov_b32_e32 v24, v1
+; GFX10-NEXT:    v_mov_b32_e32 v25, v1
+; GFX10-NEXT:    v_mov_b32_e32 v26, v1
+; GFX10-NEXT:    v_mov_b32_e32 v27, v1
+; GFX10-NEXT:    v_mov_b32_e32 v28, v1
+; GFX10-NEXT:    v_mov_b32_e32 v29, v1
+; GFX10-NEXT:    v_mov_b32_e32 v30, v1
+; GFX10-NEXT:    v_mov_b32_e32 v31, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  v_mov_b32_e32 v6, v1
-; GFX11-NEXT:  v_mov_b32_e32 v7, v1
-; GFX11-NEXT:  v_mov_b32_e32 v8, v1
-; GFX11-NEXT:  v_mov_b32_e32 v9, v1
-; GFX11-NEXT:  v_mov_b32_e32 v10, v1
-; GFX11-NEXT:  v_mov_b32_e32 v11, v1
-; GFX11-NEXT:  v_mov_b32_e32 v12, v1
-; GFX11-NEXT:  v_mov_b32_e32 v13, v1
-; GFX11-NEXT:  v_mov_b32_e32 v14, v1
-; GFX11-NEXT:  v_mov_b32_e32 v15, v1
-; GFX11-NEXT:  v_mov_b32_e32 v16, v1
-; GFX11-NEXT:  v_mov_b32_e32 v17, v1
-; GFX11-NEXT:  v_mov_b32_e32 v18, v1
-; GFX11-NEXT:  v_mov_b32_e32 v19, v1
-; GFX11-NEXT:  v_mov_b32_e32 v20, v1
-; GFX11-NEXT:  v_mov_b32_e32 v21, v1
-; GFX11-NEXT:  v_mov_b32_e32 v22, v1
-; GFX11-NEXT:  v_mov_b32_e32 v23, v1
-; GFX11-NEXT:  v_mov_b32_e32 v24, v1
-; GFX11-NEXT:  v_mov_b32_e32 v25, v1
-; GFX11-NEXT:  v_mov_b32_e32 v26, v1
-; GFX11-NEXT:  v_mov_b32_e32 v27, v1
-; GFX11-NEXT:  v_mov_b32_e32 v28, v1
-; GFX11-NEXT:  v_mov_b32_e32 v29, v1
-; GFX11-NEXT:  v_mov_b32_e32 v30, v1
-; GFX11-NEXT:  v_mov_b32_e32 v31, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_mov_b32_e32 v7, v1
+; GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GFX11-NEXT:    v_mov_b32_e32 v9, v1
+; GFX11-NEXT:    v_mov_b32_e32 v10, v1
+; GFX11-NEXT:    v_mov_b32_e32 v11, v1
+; GFX11-NEXT:    v_mov_b32_e32 v12, v1
+; GFX11-NEXT:    v_mov_b32_e32 v13, v1
+; GFX11-NEXT:    v_mov_b32_e32 v14, v1
+; GFX11-NEXT:    v_mov_b32_e32 v15, v1
+; GFX11-NEXT:    v_mov_b32_e32 v16, v1
+; GFX11-NEXT:    v_mov_b32_e32 v17, v1
+; GFX11-NEXT:    v_mov_b32_e32 v18, v1
+; GFX11-NEXT:    v_mov_b32_e32 v19, v1
+; GFX11-NEXT:    v_mov_b32_e32 v20, v1
+; GFX11-NEXT:    v_mov_b32_e32 v21, v1
+; GFX11-NEXT:    v_mov_b32_e32 v22, v1
+; GFX11-NEXT:    v_mov_b32_e32 v23, v1
+; GFX11-NEXT:    v_mov_b32_e32 v24, v1
+; GFX11-NEXT:    v_mov_b32_e32 v25, v1
+; GFX11-NEXT:    v_mov_b32_e32 v26, v1
+; GFX11-NEXT:    v_mov_b32_e32 v27, v1
+; GFX11-NEXT:    v_mov_b32_e32 v28, v1
+; GFX11-NEXT:    v_mov_b32_e32 v29, v1
+; GFX11-NEXT:    v_mov_b32_e32 v30, v1
+; GFX11-NEXT:    v_mov_b32_e32 v31, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x float> %val0, <32 x float> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
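All of the rebroadcast tests above reduce to the same demanded-elements question: only lane 1 of the loaded vector feeds the splat, so the wide load can in principle shrink to a single dword at offset 4, as the v4f32 case now shows. A minimal standalone sketch of the pattern (the function name is illustrative, not from the test file):

define <4 x float> @splat_lane1(ptr addrspace(1) %p) {
entry:
  %v = load <4 x float>, ptr addrspace(1) %p
  %splat = shufflevector <4 x float> %v, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x float> %splat
}
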
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index b85bd4c6346684..a00ee9ac0d5a9d 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -32,44 +32,33 @@ define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GX900-LABEL: shuffle_v4f16_234u:
-; GX900:       ; %bb.0:
-; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GX900-NEXT:    s_waitcnt vmcnt(1)
-; GX900-NEXT:    v_mov_b32_e32 v0, v6
-; GX900-NEXT:    s_waitcnt vmcnt(0)
-; GX900-NEXT:    v_mov_b32_e32 v1, v4
-; GX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: shuffle_v4f16_234u:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
-; GFX940-NEXT:    s_waitcnt vmcnt(1)
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v1, v6
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: shuffle_v4f16_234u:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_234u:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v6
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_234u:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
@@ -320,47 +309,43 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GX900-LABEL: shuffle_v4f16_357u:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x7060302
-; GX900-NEXT:    s_waitcnt vmcnt(1)
-; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
 ; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: shuffle_v4f16_357u:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
-; GFX940-NEXT:    s_waitcnt vmcnt(1)
-; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_357u:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_357u:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, s0, v3, 16
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    v_alignbit_b32 v1, s0, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1018,34 +1003,31 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX9-LABEL: shuffle_v4f16_3456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_3456:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_3456:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1057,12 +1039,11 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX9-LABEL: shuffle_v4f16_5634:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_5634:
@@ -1233,7 +1214,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GX900-LABEL: shuffle_v4f16_0000:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GX900-NEXT:    global_load_dword v0, v[0:1], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
 ; GX900-NEXT:    v_perm_b32 v0, v0, v0, s4
@@ -1243,7 +1224,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX940-LABEL: shuffle_v4f16_0000:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX940-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s0
@@ -1253,7 +1234,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-LABEL: shuffle_v4f16_0000:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -1262,7 +1243,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX11-LABEL: shuffle_v4f16_0000:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1905,43 +1886,39 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GX900-LABEL: shuffle_v4f16_0456:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT:    global_load_dword v6, v[0:1], off
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x5040100
-; GX900-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GX900-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
-; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
-; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: shuffle_v4f16_0456:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    global_load_dword v6, v[0:1], off
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_perm_b32 v0, v6, v4, s0
-; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
+; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_0456:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
-; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
-; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x5040100
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_0456:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
@@ -3177,44 +3154,33 @@ define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1
 }
 
 define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GX900-LABEL: shuffle_v4bf16_234u:
-; GX900:       ; %bb.0:
-; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GX900-NEXT:    s_waitcnt vmcnt(1)
-; GX900-NEXT:    v_mov_b32_e32 v0, v6
-; GX900-NEXT:    s_waitcnt vmcnt(0)
-; GX900-NEXT:    v_mov_b32_e32 v1, v4
-; GX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: shuffle_v4bf16_234u:
-; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
-; GFX940-NEXT:    s_waitcnt vmcnt(1)
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v1, v6
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: shuffle_v4bf16_234u:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_234u:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v6
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_234u:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3465,47 +3431,43 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GX900-LABEL: shuffle_v4bf16_357u:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x7060302
-; GX900-NEXT:    s_waitcnt vmcnt(1)
-; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
 ; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: shuffle_v4bf16_357u:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
-; GFX940-NEXT:    s_waitcnt vmcnt(1)
-; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_357u:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_357u:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, s0, v3, 16
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    v_alignbit_b32 v1, s0, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4163,34 +4125,31 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_3456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_3456:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_3456:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4202,12 +4161,11 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_5634:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_5634:
@@ -4293,7 +4251,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GX900-LABEL: shuffle_v4bf16_0000:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GX900-NEXT:    global_load_dword v0, v[0:1], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
 ; GX900-NEXT:    v_perm_b32 v0, v0, v0, s4
@@ -4303,7 +4261,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX940-LABEL: shuffle_v4bf16_0000:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX940-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s0
@@ -4313,7 +4271,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-LABEL: shuffle_v4bf16_0000:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -4322,7 +4280,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-LABEL: shuffle_v4bf16_0000:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -5235,43 +5193,39 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GX900-LABEL: shuffle_v4bf16_0456:
 ; GX900:       ; %bb.0:
 ; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT:    global_load_dword v6, v[0:1], off
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT:    s_mov_b32 s4, 0x5040100
-; GX900-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GX900-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GX900-NEXT:    s_waitcnt vmcnt(0)
-; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
-; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: shuffle_v4bf16_0456:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    global_load_dword v6, v[0:1], off
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_perm_b32 v0, v6, v4, s0
-; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
+; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_0456:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
-; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
-; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x5040100
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_0456:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
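The f16/bf16 shuffles in this file follow the same rule: when the shuffle reads only the low word of a 64-bit source, the dwordx2/b64 load of that source narrows to a single dword/b32. A minimal sketch of the 234u-style mask (illustrative name; poison marks the unused lane):

define <4 x half> @use_low_word_of_second(ptr addrspace(1) %a, ptr addrspace(1) %b) {
  %v0 = load <4 x half>, ptr addrspace(1) %a
  %v1 = load <4 x half>, ptr addrspace(1) %b
  %s = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 2, i32 3, i32 4, i32 poison>
  ret <4 x half> %s
}
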
diff --git a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
index 8186f6c9b42fba..695b0a796eb432 100644
--- a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
+++ b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
@@ -8,9 +8,10 @@
 define i32 @foo(ptr %descs, i32 %num, i32 %cw) local_unnamed_addr #0 {
 ; CHECK-LABEL: foo:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldr d16, [r0, #32]
+; CHECK-NEXT:    add r0, r0, #32
+; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
 ; CHECK-NEXT:    vadd.i32 d16, d16, d16
-; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    vmov.32 r0, d16[1]
 ; CHECK-NEXT:    bx lr
 entry:
   %wide.vec = load <16 x i32>, ptr %descs, align 4
diff --git a/llvm/test/CodeGen/ARM/vector-promotion.ll b/llvm/test/CodeGen/ARM/vector-promotion.ll
index 344014ad804495..c3889ccfec7dbd 100644
--- a/llvm/test/CodeGen/ARM/vector-promotion.ll
+++ b/llvm/test/CodeGen/ARM/vector-promotion.ll
@@ -44,7 +44,7 @@ define void @unsupportedInstructionForPromotion(ptr %addr1, i32 %in2, ptr %dest)
 
 
 ; IR-BOTH-LABEL: @unsupportedChainInDifferentBBs
-; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load volatile <2 x i32>, ptr %addr1
 ; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
 ; IR-BOTH-NEXT: br i1 %bool, label %bb2, label %end
 ; BB2
@@ -58,10 +58,10 @@ define void @unsupportedInstructionForPromotion(ptr %addr1, i32 %in2, ptr %dest)
 ; ASM: bx
 define void @unsupportedChainInDifferentBBs(ptr %addr1, ptr %dest, i1 %bool) {
 bb1:
-  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %in1 = load volatile <2 x i32>, ptr %addr1, align 8
   %extract = extractelement <2 x i32> %in1, i32 0
   br i1 %bool, label %bb2, label %end
-bb2: 
+bb2:
   %out = or i32 %extract, 1
   store i32 %out, ptr %dest, align 4
   br label %end
@@ -150,7 +150,7 @@ define void @udivCase(ptr %addr1, ptr %dest) {
 ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
 ;
 ; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest
-; IR-BOTH-NEXT: ret 
+; IR-BOTH-NEXT: ret
 define void @uremCase(ptr %addr1, ptr %dest) {
   %in1 = load <2 x i32>, ptr %addr1, align 8
   %extract = extractelement <2 x i32> %in1, i32 1
@@ -169,7 +169,7 @@ define void @uremCase(ptr %addr1, ptr %dest) {
 ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
 ;
 ; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest
-; IR-BOTH-NEXT: ret 
+; IR-BOTH-NEXT: ret
 define void @sdivCase(ptr %addr1, ptr %dest) {
   %in1 = load <2 x i32>, ptr %addr1, align 8
   %extract = extractelement <2 x i32> %in1, i32 1
@@ -188,7 +188,7 @@ define void @sdivCase(ptr %addr1, ptr %dest) {
 ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
 ;
 ; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest
-; IR-BOTH-NEXT: ret 
+; IR-BOTH-NEXT: ret
 define void @sremCase(ptr %addr1, ptr %dest) {
   %in1 = load <2 x i32>, ptr %addr1, align 8
   %extract = extractelement <2 x i32> %in1, i32 1
@@ -199,7 +199,7 @@ define void @sremCase(ptr %addr1, ptr %dest) {
 
 ; IR-BOTH-LABEL: @fdivCase
 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1
-; Scalar version:  
+; Scalar version:
 ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fdiv float [[EXTRACT]], 7.0
 ; Vector version:
@@ -209,7 +209,7 @@ define void @sremCase(ptr %addr1, ptr %dest) {
 ; IR-BOTH-NEXT: store float [[RES]], ptr %dest
 ; IR-BOTH-NEXT: ret
 define void @fdivCase(ptr %addr1, ptr %dest) {
-  %in1 = load <2 x float>, ptr %addr1, align 8   
+  %in1 = load <2 x float>, ptr %addr1, align 8
   %extract = extractelement <2 x float> %in1, i32 1
   %out = fdiv float %extract, 7.0
   store float %out, ptr %dest, align 4
@@ -218,7 +218,7 @@ define void @fdivCase(ptr %addr1, ptr %dest) {
 
 ; IR-BOTH-LABEL: @fremCase
 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1
-; Scalar version:  
+; Scalar version:
 ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem float [[EXTRACT]], 7.0
 ; Vector version:
@@ -228,7 +228,7 @@ define void @fdivCase(ptr %addr1, ptr %dest) {
 ; IR-BOTH-NEXT: store float [[RES]], ptr %dest
 ; IR-BOTH-NEXT: ret
 define void @fremCase(ptr %addr1, ptr %dest) {
-  %in1 = load <2 x float>, ptr %addr1, align 8   
+  %in1 = load <2 x float>, ptr %addr1, align 8
   %extract = extractelement <2 x float> %in1, i32 1
   %out = frem float %extract, 7.0
   store float %out, ptr %dest, align 4
@@ -272,7 +272,7 @@ define void @undefRemCase(ptr %addr1, ptr %dest) {
 ; flag is set.
 ; IR-BOTH-LABEL: @undefConstantFRemCaseWithFastMath
 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1
-; Scalar version:  
+; Scalar version:
 ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float [[EXTRACT]], 7.0
 ; Vector version:
@@ -282,7 +282,7 @@ define void @undefRemCase(ptr %addr1, ptr %dest) {
 ; IR-BOTH-NEXT: store float [[RES]], ptr %dest
 ; IR-BOTH-NEXT: ret
 define void @undefConstantFRemCaseWithFastMath(ptr %addr1, ptr %dest) {
-  %in1 = load <2 x float>, ptr %addr1, align 8   
+  %in1 = load <2 x float>, ptr %addr1, align 8
   %extract = extractelement <2 x float> %in1, i32 1
   %out = frem nnan float %extract, 7.0
   store float %out, ptr %dest, align 4
@@ -293,7 +293,7 @@ define void @undefConstantFRemCaseWithFastMath(ptr %addr1, ptr %dest) {
 ; flag is set.
 ; IR-BOTH-LABEL: @undefVectorFRemCaseWithFastMath
 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1
-; Scalar version:  
+; Scalar version:
 ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float 7.000000e+00, [[EXTRACT]]
 ; Vector version:
@@ -303,7 +303,7 @@ define void @undefConstantFRemCaseWithFastMath(ptr %addr1, ptr %dest) {
 ; IR-BOTH-NEXT: store float [[RES]], ptr %dest
 ; IR-BOTH-NEXT: ret
 define void @undefVectorFRemCaseWithFastMath(ptr %addr1, ptr %dest) {
-  %in1 = load <2 x float>, ptr %addr1, align 8   
+  %in1 = load <2 x float>, ptr %addr1, align 8
   %extract = extractelement <2 x float> %in1, i32 1
   %out = frem nnan float 7.0, %extract
   store float %out, ptr %dest, align 4
@@ -315,7 +315,7 @@ define void @undefVectorFRemCaseWithFastMath(ptr %addr1, ptr %dest) {
 ; not promote on armv7.
 ; IR-BOTH-LABEL: @simpleOneInstructionPromotionFloat
 ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1
-; Scalar version: 
+; Scalar version:
 ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd float [[EXTRACT]], 1.0
 ; Vector version:
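The unsupportedChainInDifferentBBs change above shows the volatile technique for pinning a test: a volatile load is not a candidate for demanded-elements narrowing, so the full <2 x i32> load survives and the cross-block chain the test was written for is still exercised. A minimal sketch of the idiom (names are illustrative):

define i32 @keep_wide_load(ptr %addr) {
  %v = load volatile <2 x i32>, ptr %addr, align 8
  %e = extractelement <2 x i32> %v, i32 0
  ret i32 %e
}
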
diff --git a/llvm/test/CodeGen/ARM/vext.ll b/llvm/test/CodeGen/ARM/vext.ll
index 7ddf1d02834c38..06fc0cf4303f48 100644
--- a/llvm/test/CodeGen/ARM/vext.ll
+++ b/llvm/test/CodeGen/ARM/vext.ll
@@ -76,9 +76,10 @@ define <4 x i16> @test_vextd16(ptr %A, ptr %B) nounwind {
 define <4 x i32> @test_vextq32(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: test_vextq32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
-; CHECK-NEXT:    vext.32 q8, q9, q8, #3
+; CHECK-NEXT:    add r0, r0, #12
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
+; CHECK-NEXT:    vld1.32 {d17[1]}, [r0:32]
+; CHECK-NEXT:    vext.32 q8, q8, q9, #3
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
 ; CHECK-NEXT:    mov pc, lr
diff --git a/llvm/test/CodeGen/ARM/vuzp.ll b/llvm/test/CodeGen/ARM/vuzp.ll
index d54446a431ee98..c412189633a9bd 100644
--- a/llvm/test/CodeGen/ARM/vuzp.ll
+++ b/llvm/test/CodeGen/ARM/vuzp.ll
@@ -285,13 +285,13 @@ entry:
 define <4 x i32> @vuzp_lower_shufflemask_zeroed(ptr %A, ptr %B) {
 ; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldr d18, [r0]
-; CHECK-NEXT:    vorr d19, d18, d18
-; CHECK-NEXT:    vldr d17, [r1]
-; CHECK-NEXT:    vtrn.32 d19, d17
-; CHECK-NEXT:    vdup.32 d16, d18[0]
-; CHECK-NEXT:    vmov r2, r3, d17
-; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    add r0, r1, #4
+; CHECK-NEXT:    vld1.32 {d19[1]}, [r0:32]
+; CHECK-NEXT:    vdup.32 d18, d16[0]
+; CHECK-NEXT:    vtrn.32 d16, d19
+; CHECK-NEXT:    vmov r0, r1, d18
+; CHECK-NEXT:    vmov r2, r3, d19
 ; CHECK-NEXT:    mov pc, lr
 entry:
   %tmp1 = load <2 x i32>, ptr %A
diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll
index 383e5ef19cebf1..09e2cb89be618a 100644
--- a/llvm/test/CodeGen/Mips/cconv/vector.ll
+++ b/llvm/test/CodeGen/Mips/cconv/vector.ll
@@ -5513,28 +5513,28 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) {
 ; MIPS32R5-NEXT:    jr $ra
 ; MIPS32R5-NEXT:    nop
 ;
-; MIPS64R5-LABEL: mixed_i8:
-; MIPS64R5:       # %bb.0: # %entry
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -48
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 48
-; MIPS64R5-NEXT:    sll $1, $5, 0
-; MIPS64R5-NEXT:    andi $1, $1, 255
-; MIPS64R5-NEXT:    mtc1 $1, $f0
-; MIPS64R5-NEXT:    cvt.s.w $f0, $f0
-; MIPS64R5-NEXT:    swc1 $f0, 36($sp)
-; MIPS64R5-NEXT:    swc1 $f0, 32($sp)
-; MIPS64R5-NEXT:    sd $4, 0($sp)
-; MIPS64R5-NEXT:    ld.w $w0, 0($sp)
-; MIPS64R5-NEXT:    ld.w $w1, 32($sp)
-; MIPS64R5-NEXT:    fadd.w $w0, $w1, $w0
-; MIPS64R5-NEXT:    sd $6, 16($sp)
-; MIPS64R5-NEXT:    ld.w $w1, 16($sp)
-; MIPS64R5-NEXT:    fadd.w $w0, $w0, $w1
-; MIPS64R5-NEXT:    splati.w $w1, $w0[1]
-; MIPS64R5-NEXT:    add.s $f0, $f0, $f1
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 48
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: mixed_i8:
+; MIPS64R5EB:       # %bb.0: # %entry
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -48
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 48
+; MIPS64R5EB-NEXT:    sll $1, $5, 0
+; MIPS64R5EB-NEXT:    andi $1, $1, 255
+; MIPS64R5EB-NEXT:    mtc1 $1, $f0
+; MIPS64R5EB-NEXT:    cvt.s.w $f0, $f0
+; MIPS64R5EB-NEXT:    swc1 $f0, 36($sp)
+; MIPS64R5EB-NEXT:    swc1 $f0, 32($sp)
+; MIPS64R5EB-NEXT:    insert.d $w0[0], $4
+; MIPS64R5EB-NEXT:    shf.w $w0, $w0, 177
+; MIPS64R5EB-NEXT:    ld.w $w1, 32($sp)
+; MIPS64R5EB-NEXT:    fadd.w $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    insert.d $w1[0], $6
+; MIPS64R5EB-NEXT:    shf.w $w1, $w1, 177
+; MIPS64R5EB-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    splati.w $w1, $w0[1]
+; MIPS64R5EB-NEXT:    add.s $f0, $f0, $f1
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 48
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
 ;
 ; MIPS64EL-LABEL: mixed_i8:
 ; MIPS64EL:       # %bb.0: # %entry
@@ -5559,6 +5559,27 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) {
 ; MIPS64EL-NEXT:    add.s $f0, $f1, $f0
 ; MIPS64EL-NEXT:    jr $ra
 ; MIPS64EL-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: mixed_i8:
+; MIPS64R5EL:       # %bb.0: # %entry
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -48
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 48
+; MIPS64R5EL-NEXT:    sll $1, $5, 0
+; MIPS64R5EL-NEXT:    andi $1, $1, 255
+; MIPS64R5EL-NEXT:    mtc1 $1, $f0
+; MIPS64R5EL-NEXT:    cvt.s.w $f0, $f0
+; MIPS64R5EL-NEXT:    swc1 $f0, 36($sp)
+; MIPS64R5EL-NEXT:    swc1 $f0, 32($sp)
+; MIPS64R5EL-NEXT:    insert.d $w0[0], $4
+; MIPS64R5EL-NEXT:    ld.w $w1, 32($sp)
+; MIPS64R5EL-NEXT:    fadd.w $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    insert.d $w1[0], $6
+; MIPS64R5EL-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    splati.w $w1, $w0[1]
+; MIPS64R5EL-NEXT:    add.s $f0, $f0, $f1
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 48
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
 entry:
   %0 = zext i8 %b to i32
   %1 = uitofp i32 %0 to float
diff --git a/llvm/test/CodeGen/Mips/msa/basic_operations.ll b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
index 4fc3f57aa002df..7c8c31c0ec181f 100644
--- a/llvm/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
@@ -1014,34 +1014,54 @@ define i32 @extract_sext_v4i32() nounwind {
 ; O32:       # %bb.0:
 ; O32-NEXT:    lui $2, %hi(_gp_disp)
 ; O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; O32-NEXT:    addiu $sp, $sp, -32
+; O32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; O32-NEXT:    sw $fp, 24($sp) # 4-byte Folded Spill
+; O32-NEXT:    move $fp, $sp
+; O32-NEXT:    addiu $1, $zero, -16
+; O32-NEXT:    and $sp, $sp, $1
 ; O32-NEXT:    addu $1, $2, $25
 ; O32-NEXT:    lw $1, %got(v4i32)($1)
-; O32-NEXT:    ld.w $w0, 0($1)
+; O32-NEXT:    lw $1, 4($1)
+; O32-NEXT:    sw $1, 4($sp)
+; O32-NEXT:    ld.w $w0, 0($sp)
 ; O32-NEXT:    addv.w $w0, $w0, $w0
-; O32-NEXT:    jr $ra
 ; O32-NEXT:    copy_s.w $2, $w0[1]
+; O32-NEXT:    move $sp, $fp
+; O32-NEXT:    lw $fp, 24($sp) # 4-byte Folded Reload
+; O32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; O32-NEXT:    jr $ra
+; O32-NEXT:    addiu $sp, $sp, 32
 ;
 ; N32-LABEL: extract_sext_v4i32:
 ; N32:       # %bb.0:
+; N32-NEXT:    addiu $sp, $sp, -16
 ; N32-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_sext_v4i32)))
 ; N32-NEXT:    addu $1, $1, $25
 ; N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v4i32)))
 ; N32-NEXT:    lw $1, %got_disp(v4i32)($1)
-; N32-NEXT:    ld.w $w0, 0($1)
+; N32-NEXT:    lw $1, 4($1)
+; N32-NEXT:    sw $1, 4($sp)
+; N32-NEXT:    ld.w $w0, 0($sp)
 ; N32-NEXT:    addv.w $w0, $w0, $w0
-; N32-NEXT:    jr $ra
 ; N32-NEXT:    copy_s.w $2, $w0[1]
+; N32-NEXT:    jr $ra
+; N32-NEXT:    addiu $sp, $sp, 16
 ;
 ; N64-LABEL: extract_sext_v4i32:
 ; N64:       # %bb.0:
+; N64-NEXT:    daddiu $sp, $sp, -16
 ; N64-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_sext_v4i32)))
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v4i32)))
 ; N64-NEXT:    ld $1, %got_disp(v4i32)($1)
-; N64-NEXT:    ld.w $w0, 0($1)
+; N64-NEXT:    lw $1, 4($1)
+; N64-NEXT:    sw $1, 4($sp)
+; N64-NEXT:    ld.w $w0, 0($sp)
 ; N64-NEXT:    addv.w $w0, $w0, $w0
-; N64-NEXT:    jr $ra
 ; N64-NEXT:    copy_s.w $2, $w0[1]
+; N64-NEXT:    jr $ra
+; N64-NEXT:    daddiu $sp, $sp, 16
   %1 = load <4 x i32>, ptr @v4i32
   %2 = add <4 x i32> %1, %1
   %3 = extractelement <4 x i32> %2, i32 1
@@ -1076,25 +1096,33 @@ define i64 @extract_sext_v2i64() nounwind {
 ;
 ; N32-LABEL: extract_sext_v2i64:
 ; N32:       # %bb.0:
+; N32-NEXT:    addiu $sp, $sp, -16
 ; N32-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_sext_v2i64)))
 ; N32-NEXT:    addu $1, $1, $25
 ; N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v2i64)))
 ; N32-NEXT:    lw $1, %got_disp(v2i64)($1)
-; N32-NEXT:    ld.d $w0, 0($1)
+; N32-NEXT:    ld $1, 8($1)
+; N32-NEXT:    sd $1, 8($sp)
+; N32-NEXT:    ld.d $w0, 0($sp)
 ; N32-NEXT:    addv.d $w0, $w0, $w0
-; N32-NEXT:    jr $ra
 ; N32-NEXT:    copy_s.d $2, $w0[1]
+; N32-NEXT:    jr $ra
+; N32-NEXT:    addiu $sp, $sp, 16
 ;
 ; N64-LABEL: extract_sext_v2i64:
 ; N64:       # %bb.0:
+; N64-NEXT:    daddiu $sp, $sp, -16
 ; N64-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_sext_v2i64)))
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v2i64)))
 ; N64-NEXT:    ld $1, %got_disp(v2i64)($1)
-; N64-NEXT:    ld.d $w0, 0($1)
+; N64-NEXT:    ld $1, 8($1)
+; N64-NEXT:    sd $1, 8($sp)
+; N64-NEXT:    ld.d $w0, 0($sp)
 ; N64-NEXT:    addv.d $w0, $w0, $w0
-; N64-NEXT:    jr $ra
 ; N64-NEXT:    copy_s.d $2, $w0[1]
+; N64-NEXT:    jr $ra
+; N64-NEXT:    daddiu $sp, $sp, 16
   %1 = load <2 x i64>, ptr @v2i64
   %2 = add <2 x i64> %1, %1
   %3 = extractelement <2 x i64> %2, i32 1
@@ -1186,34 +1214,54 @@ define i32 @extract_zext_v4i32() nounwind {
 ; O32:       # %bb.0:
 ; O32-NEXT:    lui $2, %hi(_gp_disp)
 ; O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; O32-NEXT:    addiu $sp, $sp, -32
+; O32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; O32-NEXT:    sw $fp, 24($sp) # 4-byte Folded Spill
+; O32-NEXT:    move $fp, $sp
+; O32-NEXT:    addiu $1, $zero, -16
+; O32-NEXT:    and $sp, $sp, $1
 ; O32-NEXT:    addu $1, $2, $25
 ; O32-NEXT:    lw $1, %got(v4i32)($1)
-; O32-NEXT:    ld.w $w0, 0($1)
+; O32-NEXT:    lw $1, 4($1)
+; O32-NEXT:    sw $1, 4($sp)
+; O32-NEXT:    ld.w $w0, 0($sp)
 ; O32-NEXT:    addv.w $w0, $w0, $w0
-; O32-NEXT:    jr $ra
 ; O32-NEXT:    copy_s.w $2, $w0[1]
+; O32-NEXT:    move $sp, $fp
+; O32-NEXT:    lw $fp, 24($sp) # 4-byte Folded Reload
+; O32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; O32-NEXT:    jr $ra
+; O32-NEXT:    addiu $sp, $sp, 32
 ;
 ; N32-LABEL: extract_zext_v4i32:
 ; N32:       # %bb.0:
+; N32-NEXT:    addiu $sp, $sp, -16
 ; N32-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_zext_v4i32)))
 ; N32-NEXT:    addu $1, $1, $25
 ; N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v4i32)))
 ; N32-NEXT:    lw $1, %got_disp(v4i32)($1)
-; N32-NEXT:    ld.w $w0, 0($1)
+; N32-NEXT:    lw $1, 4($1)
+; N32-NEXT:    sw $1, 4($sp)
+; N32-NEXT:    ld.w $w0, 0($sp)
 ; N32-NEXT:    addv.w $w0, $w0, $w0
-; N32-NEXT:    jr $ra
 ; N32-NEXT:    copy_s.w $2, $w0[1]
+; N32-NEXT:    jr $ra
+; N32-NEXT:    addiu $sp, $sp, 16
 ;
 ; N64-LABEL: extract_zext_v4i32:
 ; N64:       # %bb.0:
+; N64-NEXT:    daddiu $sp, $sp, -16
 ; N64-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_zext_v4i32)))
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v4i32)))
 ; N64-NEXT:    ld $1, %got_disp(v4i32)($1)
-; N64-NEXT:    ld.w $w0, 0($1)
+; N64-NEXT:    lw $1, 4($1)
+; N64-NEXT:    sw $1, 4($sp)
+; N64-NEXT:    ld.w $w0, 0($sp)
 ; N64-NEXT:    addv.w $w0, $w0, $w0
-; N64-NEXT:    jr $ra
 ; N64-NEXT:    copy_s.w $2, $w0[1]
+; N64-NEXT:    jr $ra
+; N64-NEXT:    daddiu $sp, $sp, 16
   %1 = load <4 x i32>, ptr @v4i32
   %2 = add <4 x i32> %1, %1
   %3 = extractelement <4 x i32> %2, i32 1
@@ -1248,25 +1296,33 @@ define i64 @extract_zext_v2i64() nounwind {
 ;
 ; N32-LABEL: extract_zext_v2i64:
 ; N32:       # %bb.0:
+; N32-NEXT:    addiu $sp, $sp, -16
 ; N32-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_zext_v2i64)))
 ; N32-NEXT:    addu $1, $1, $25
 ; N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v2i64)))
 ; N32-NEXT:    lw $1, %got_disp(v2i64)($1)
-; N32-NEXT:    ld.d $w0, 0($1)
+; N32-NEXT:    ld $1, 8($1)
+; N32-NEXT:    sd $1, 8($sp)
+; N32-NEXT:    ld.d $w0, 0($sp)
 ; N32-NEXT:    addv.d $w0, $w0, $w0
-; N32-NEXT:    jr $ra
 ; N32-NEXT:    copy_s.d $2, $w0[1]
+; N32-NEXT:    jr $ra
+; N32-NEXT:    addiu $sp, $sp, 16
 ;
 ; N64-LABEL: extract_zext_v2i64:
 ; N64:       # %bb.0:
+; N64-NEXT:    daddiu $sp, $sp, -16
 ; N64-NEXT:    lui $1, %hi(%neg(%gp_rel(extract_zext_v2i64)))
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v2i64)))
 ; N64-NEXT:    ld $1, %got_disp(v2i64)($1)
-; N64-NEXT:    ld.d $w0, 0($1)
+; N64-NEXT:    ld $1, 8($1)
+; N64-NEXT:    sd $1, 8($sp)
+; N64-NEXT:    ld.d $w0, 0($sp)
 ; N64-NEXT:    addv.d $w0, $w0, $w0
-; N64-NEXT:    jr $ra
 ; N64-NEXT:    copy_s.d $2, $w0[1]
+; N64-NEXT:    jr $ra
+; N64-NEXT:    daddiu $sp, $sp, 16
   %1 = load <2 x i64>, ptr @v2i64
   %2 = add <2 x i64> %1, %1
   %3 = extractelement <2 x i64> %2, i32 1
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index ca1b5fdabbf8ff..3a82a8abd20c66 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -298,13 +298,13 @@ define i128 @srem_i128_pow2k(i128 %lhs) {
 define i128 @urem_i128_pow2k(i128 %lhs) {
 ; CHECK-LABEL: urem_i128_pow2k(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [urem_i128_pow2k_param_0];
-; CHECK-NEXT:    and.b64 %rd3, %rd1, 8589934591;
-; CHECK-NEXT:    mov.b64 %rd4, 0;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd3, %rd4};
+; CHECK-NEXT:    ld.param.u64 %rd1, [urem_i128_pow2k_param_0];
+; CHECK-NEXT:    and.b64 %rd2, %rd1, 8589934591;
+; CHECK-NEXT:    mov.b64 %rd3, 0;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd3};
 ; CHECK-NEXT:    ret;
   %div = urem i128 %lhs, 8589934592
   ret i128 %div
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index c2f166770a7ad9..e1079814a8e7a9 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -268,19 +268,19 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    bfe.s32 %r9, %r2, 24, 8;
 ; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
 ; CHECK-NEXT:    setp.gt.s32 %p4, %r10, %r9;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r12, %r1, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r13, %r1, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r15, %r2, 24, 8;
-; CHECK-NEXT:    selp.b32 %r16, %r14, %r15, %p4;
-; CHECK-NEXT:    bfe.u32 %r17, %r2, 16, 8;
-; CHECK-NEXT:    selp.b32 %r18, %r13, %r17, %p3;
+; CHECK-NEXT:    bfe.u32 %r11, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r13, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r1, 24, 8;
+; CHECK-NEXT:    selp.b32 %r16, %r15, %r14, %p4;
+; CHECK-NEXT:    bfe.u32 %r17, %r1, 16, 8;
+; CHECK-NEXT:    selp.b32 %r18, %r17, %r13, %p3;
 ; CHECK-NEXT:    prmt.b32 %r19, %r18, %r16, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r20, %r2, 8, 8;
-; CHECK-NEXT:    selp.b32 %r21, %r12, %r20, %p2;
-; CHECK-NEXT:    bfe.u32 %r22, %r2, 0, 8;
-; CHECK-NEXT:    selp.b32 %r23, %r11, %r22, %p1;
+; CHECK-NEXT:    bfe.u32 %r20, %r1, 8, 8;
+; CHECK-NEXT:    selp.b32 %r21, %r20, %r12, %p2;
+; CHECK-NEXT:    bfe.u32 %r22, %r1, 0, 8;
+; CHECK-NEXT:    selp.b32 %r23, %r22, %r11, %p1;
 ; CHECK-NEXT:    prmt.b32 %r24, %r23, %r21, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r19, 0x5410U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r25;
@@ -346,19 +346,19 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    bfe.s32 %r9, %r2, 24, 8;
 ; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
 ; CHECK-NEXT:    setp.le.s32 %p4, %r10, %r9;
-; CHECK-NEXT:    bfe.u32 %r11, %r1, 0, 8;
-; CHECK-NEXT:    bfe.u32 %r12, %r1, 8, 8;
-; CHECK-NEXT:    bfe.u32 %r13, %r1, 16, 8;
-; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
-; CHECK-NEXT:    bfe.u32 %r15, %r2, 24, 8;
-; CHECK-NEXT:    selp.b32 %r16, %r14, %r15, %p4;
-; CHECK-NEXT:    bfe.u32 %r17, %r2, 16, 8;
-; CHECK-NEXT:    selp.b32 %r18, %r13, %r17, %p3;
+; CHECK-NEXT:    bfe.u32 %r11, %r2, 0, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r2, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r13, %r2, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r14, %r2, 24, 8;
+; CHECK-NEXT:    bfe.u32 %r15, %r1, 24, 8;
+; CHECK-NEXT:    selp.b32 %r16, %r15, %r14, %p4;
+; CHECK-NEXT:    bfe.u32 %r17, %r1, 16, 8;
+; CHECK-NEXT:    selp.b32 %r18, %r17, %r13, %p3;
 ; CHECK-NEXT:    prmt.b32 %r19, %r18, %r16, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r20, %r2, 8, 8;
-; CHECK-NEXT:    selp.b32 %r21, %r12, %r20, %p2;
-; CHECK-NEXT:    bfe.u32 %r22, %r2, 0, 8;
-; CHECK-NEXT:    selp.b32 %r23, %r11, %r22, %p1;
+; CHECK-NEXT:    bfe.u32 %r20, %r1, 8, 8;
+; CHECK-NEXT:    selp.b32 %r21, %r20, %r12, %p2;
+; CHECK-NEXT:    bfe.u32 %r22, %r1, 0, 8;
+; CHECK-NEXT:    selp.b32 %r23, %r22, %r11, %p1;
 ; CHECK-NEXT:    prmt.b32 %r24, %r23, %r21, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r25, %r24, %r19, 0x5410U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r25;
diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll b/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll
index 80c26471d8cdb0..8eaa8d7713bcd4 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll
@@ -15,9 +15,9 @@ define i32 @vec_struct_test(i32 %i, ptr nocapture readonly byval(%struct.vec_str
   ; 32BIT: bb.0.entry:
   ; 32BIT-NEXT:   liveins: $r3, $r5, $r6, $r7, $r8
   ; 32BIT-NEXT: {{  $}}
-  ; 32BIT-NEXT:   STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8)
-  ; 32BIT-NEXT:   STW killed renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
   ; 32BIT-NEXT:   STW renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
+  ; 32BIT-NEXT:   STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8)
   ; 32BIT-NEXT:   STW killed renamable $r8, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12)
   ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 killed renamable $r5, killed renamable $r3
   ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
index 7f6fdc7f88cd11..b5607e3d91e105 100644
--- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -1105,16 +1105,11 @@ define <2 x i64> @testSplati64_1(ptr nocapture readonly %ptr) #0 {
 ;
 ; CHECK-NOVSX-LABEL: testSplati64_1:
 ; CHECK-NOVSX:       # %bb.0: # %entry
-; CHECK-NOVSX-NEXT:    ld r4, 8(r3)
-; CHECK-NOVSX-NEXT:    std r4, -8(r1)
-; CHECK-NOVSX-NEXT:    ld r3, 0(r3)
+; CHECK-NOVSX-NEXT:    ld r3, 8(r3)
+; CHECK-NOVSX-NEXT:    std r3, -8(r1)
 ; CHECK-NOVSX-NEXT:    std r3, -16(r1)
 ; CHECK-NOVSX-NEXT:    addi r3, r1, -16
 ; CHECK-NOVSX-NEXT:    lvx v2, 0, r3
-; CHECK-NOVSX-NEXT:    addis r3, r2, .LCPI21_0@toc@ha
-; CHECK-NOVSX-NEXT:    addi r3, r3, .LCPI21_0@toc@l
-; CHECK-NOVSX-NEXT:    lvx v3, 0, r3
-; CHECK-NOVSX-NEXT:    vperm v2, v2, v2, v3
 ; CHECK-NOVSX-NEXT:    blr
 ;
 ; CHECK-P7-LABEL: testSplati64_1:
diff --git a/llvm/test/CodeGen/PowerPC/const-stov.ll b/llvm/test/CodeGen/PowerPC/const-stov.ll
index 69c68a4f27371e..c32c1ff1fc06ba 100644
--- a/llvm/test/CodeGen/PowerPC/const-stov.ll
+++ b/llvm/test/CodeGen/PowerPC/const-stov.ll
@@ -132,28 +132,29 @@ entry:
 define  <2 x i64> @i64(ptr nocapture noundef readonly %p) {
 ; PWR7-BE-LABEL: i64:
 ; PWR7-BE:       # %bb.0: # %entry
-; PWR7-BE-NEXT:    lxvd2x v2, 0, r3
+; PWR7-BE-NEXT:    lfd f0, 0(r3)
 ; PWR7-BE-NEXT:    li r3, 10
+; PWR7-BE-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; PWR7-BE-NEXT:    std r3, -16(r1)
-; PWR7-BE-NEXT:    std r3, -8(r1)
-; PWR7-BE-NEXT:    addi r3, r1, -16
-; PWR7-BE-NEXT:    lxvd2x v3, 0, r3
+; PWR7-BE-NEXT:    lfd f0, -16(r1)
+; PWR7-BE-NEXT:    xxpermdi v3, vs0, vs0, 1
 ; PWR7-BE-NEXT:    xxmrghd v2, v2, v3
 ; PWR7-BE-NEXT:    blr
 ;
 ; PWR8-BE-LABEL: i64:
 ; PWR8-BE:       # %bb.0: # %entry
-; PWR8-BE-NEXT:    lxvd2x v2, 0, r3
+; PWR8-BE-NEXT:    lfd f0, 0(r3)
 ; PWR8-BE-NEXT:    li r3, 10
+; PWR8-BE-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; PWR8-BE-NEXT:    mtfprd f0, r3
 ; PWR8-BE-NEXT:    xxmrghd v2, v2, vs0
 ; PWR8-BE-NEXT:    blr
 ;
 ; PWR8-LE-LABEL: i64:
 ; PWR8-LE:       # %bb.0: # %entry
-; PWR8-LE-NEXT:    lxvd2x vs0, 0, r3
+; PWR8-LE-NEXT:    lfd f0, 0(r3)
 ; PWR8-LE-NEXT:    li r3, 10
-; PWR8-LE-NEXT:    xxswapd v2, vs0
+; PWR8-LE-NEXT:    xxspltd v2, vs0, 0
 ; PWR8-LE-NEXT:    mtfprd f0, r3
 ; PWR8-LE-NEXT:    xxpermdi v2, vs0, v2, 1
 ; PWR8-LE-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/pr27078.ll b/llvm/test/CodeGen/PowerPC/pr27078.ll
index ee4d4ff9c6c790..beb63ce0127bcf 100644
--- a/llvm/test/CodeGen/PowerPC/pr27078.ll
+++ b/llvm/test/CodeGen/PowerPC/pr27078.ll
@@ -4,19 +4,21 @@
 define <4 x float> @bar(ptr %p, ptr %q) {
 ; CHECK-LABEL: bar:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li 5, 16
-; CHECK-NEXT:    lxvw4x 1, 0, 3
-; CHECK-NEXT:    lxvw4x 3, 0, 4
-; CHECK-NEXT:    xvsubsp 35, 3, 1
-; CHECK-NEXT:    lxvw4x 0, 3, 5
-; CHECK-NEXT:    lxvw4x 2, 4, 5
+; CHECK-NEXT:    li 5, 24
+; CHECK-NEXT:    lxvw4x 1, 0, 4
+; CHECK-NEXT:    lfiwzx 0, 3, 5
+; CHECK-NEXT:    xxmrghw 34, 0, 0
+; CHECK-NEXT:    lfiwzx 0, 4, 5
 ; CHECK-NEXT:    addis 5, 2, .LCPI0_0@toc@ha
 ; CHECK-NEXT:    addi 5, 5, .LCPI0_0@toc@l
 ; CHECK-NEXT:    lxvw4x 36, 0, 5
-; CHECK-NEXT:    li 5, 32
-; CHECK-NEXT:    xvsubsp 34, 2, 0
-; CHECK-NEXT:    lxvw4x 0, 3, 5
-; CHECK-NEXT:    lxvw4x 1, 4, 5
+; CHECK-NEXT:    li 5, 36
+; CHECK-NEXT:    xxmrghw 35, 0, 0
+; CHECK-NEXT:    lxvw4x 0, 0, 3
+; CHECK-NEXT:    xvsubsp 34, 35, 34
+; CHECK-NEXT:    xvsubsp 35, 1, 0
+; CHECK-NEXT:    lfiwzx 0, 3, 5
+; CHECK-NEXT:    lfiwzx 1, 4, 5
 ; CHECK-NEXT:    addis 3, 2, .LCPI0_1@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .LCPI0_1@toc@l
 ; CHECK-NEXT:    vperm 2, 3, 2, 4
diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
index 4435484ae0b947..1a897f3498ab9f 100644
--- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
+++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll
@@ -317,24 +317,28 @@ define void @test16(ptr nocapture readonly %sums, i32 signext %delta, i32 signex
 ; P9BE-AIX32-LABEL: test16:
 ; P9BE-AIX32:       # %bb.0: # %entry
 ; P9BE-AIX32-NEXT:    slwi 4, 4, 1
-; P9BE-AIX32-NEXT:    li 6, 0
 ; P9BE-AIX32-NEXT:    lhzux 4, 3, 4
 ; P9BE-AIX32-NEXT:    lhz 3, 16(3)
-; P9BE-AIX32-NEXT:    sth 6, -64(1)
-; P9BE-AIX32-NEXT:    lxv 2, -64(1)
 ; P9BE-AIX32-NEXT:    sth 4, -48(1)
-; P9BE-AIX32-NEXT:    lxv 4, -48(1)
 ; P9BE-AIX32-NEXT:    sth 3, -32(1)
+; P9BE-AIX32-NEXT:    li 3, 0
+; P9BE-AIX32-NEXT:    sth 3, -64(1)
+; P9BE-AIX32-NEXT:    lwz 3, -32(1)
+; P9BE-AIX32-NEXT:    lxv 3, -64(1)
+; P9BE-AIX32-NEXT:    mtfprwz 0, 3
+; P9BE-AIX32-NEXT:    lwz 3, -48(1)
+; P9BE-AIX32-NEXT:    xxinsertw 2, 0, 0
+; P9BE-AIX32-NEXT:    mtfprwz 0, 3
 ; P9BE-AIX32-NEXT:    lwz 3, L..C3(2) # %const.0
-; P9BE-AIX32-NEXT:    lxv 3, -32(1)
-; P9BE-AIX32-NEXT:    vmrghh 4, 2, 4
+; P9BE-AIX32-NEXT:    vmrghh 2, 3, 2
+; P9BE-AIX32-NEXT:    xxinsertw 4, 0, 0
+; P9BE-AIX32-NEXT:    vmrghh 4, 3, 4
+; P9BE-AIX32-NEXT:    vsplth 3, 3, 0
 ; P9BE-AIX32-NEXT:    lxv 0, 0(3)
-; P9BE-AIX32-NEXT:    vmrghh 3, 2, 3
-; P9BE-AIX32-NEXT:    vsplth 2, 2, 0
-; P9BE-AIX32-NEXT:    xxmrghw 2, 2, 4
-; P9BE-AIX32-NEXT:    xxperm 3, 2, 0
-; P9BE-AIX32-NEXT:    xxspltw 2, 3, 1
-; P9BE-AIX32-NEXT:    vadduwm 2, 3, 2
+; P9BE-AIX32-NEXT:    xxmrghw 3, 3, 4
+; P9BE-AIX32-NEXT:    xxperm 2, 3, 0
+; P9BE-AIX32-NEXT:    xxspltw 3, 2, 1
+; P9BE-AIX32-NEXT:    vadduwm 2, 2, 3
 ; P9BE-AIX32-NEXT:    stxv 2, -16(1)
 ; P9BE-AIX32-NEXT:    lwz 3, -16(1)
 ; P9BE-AIX32-NEXT:    cmpw 3, 5
diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll
index 3ab49cd39f8d80..cefe5ad7b9e774 100644
--- a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll
@@ -251,9 +251,13 @@ define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_none_v16i8:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, 0(r4)
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -331,8 +335,12 @@ define <16 x i8> @test_v16i8_v8i16(i16 %arg, i8 %arg1) {
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r4, -32(r1)
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -411,8 +419,12 @@ define <16 x i8> @test_v8i16_v16i8(i16 %arg, i8 %arg1) {
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r4, -32(r1)
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v3, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -494,9 +506,13 @@ define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_none_v8i16:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, 0(r4)
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -640,9 +656,11 @@ define <16 x i8> @test_v16i8_v4i32(i8 %arg, i32 %arg1, <16 x i8> %a, <4 x i32> %
 ; CHECK-AIX-32-P9-LABEL: test_v16i8_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghb v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -721,9 +739,11 @@ define <16 x i8> @test_v4i32_v16i8(i32 %arg, i8 %arg1) {
 ; CHECK-AIX-32-P9-LABEL: test_v4i32_v16i8:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v3, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -984,9 +1004,11 @@ define <16 x i8> @test_v16i8_v2i64(i8 %arg, i64 %arg1, <16 x i8> %a, <2 x i64> %
 ; CHECK-AIX-32-P9-LABEL: test_v16i8_v2i64:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghb v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1059,9 +1081,11 @@ define <16 x i8> @test_v2i64_v16i8(i64 %arg, i8 %arg1) {
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v16i8:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r5, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v3, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1413,8 +1437,12 @@ define <16 x i8> @test_v8i16_v8i16rhs(i16 %arg, i16 %arg1) {
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    sth r4, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v3, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1493,9 +1521,11 @@ define <16 x i8> @test_v8i16_v4i32(<8 x i16> %a, <4 x i32> %b, i16 %arg, i32 %ar
 ; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghb v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1568,9 +1598,11 @@ define <16 x i8> @test_v8i16_v2i64(<8 x i16> %a, <2 x i64> %b, i16 %arg, i64 %ar
 ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghb v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1638,10 +1670,10 @@ define <16 x i8> @test_v4i32_v4i32(i32 %arg, i32 %arg1, <4 x i32> %a, <4 x i32>
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r4, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs0, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw v2, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1719,10 +1751,12 @@ define <16 x i8> @test_v4i32_v8i16(i32 %arg, i16 %arg1) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    sth r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1866,10 +1900,10 @@ define <16 x i8> @test_v2i64_v4i32(i64 %arg, i32 %arg1, <2 x i64> %a, <4 x i32>
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r5, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r5
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs0, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw v2, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -1941,10 +1975,12 @@ define <16 x i8> @test_v2i64_v8i16(i64 %arg, i16 %arg1) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    sth r5, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v2, v2, v3
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll
index fcfcda586694d5..e7596e8cb78884 100644
--- a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll
@@ -99,15 +99,14 @@ entry:
 define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_none_v16i8:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_none_v16i8:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprd f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P9-NEXT:    blr
@@ -115,13 +114,13 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-BE-P8-LABEL: test_none_v16i8:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_none_v16i8:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P9-NEXT:    blr
@@ -129,13 +128,13 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-64-P8-LABEL: test_none_v16i8:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_none_v16i8:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -144,7 +143,8 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    stb r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -153,7 +153,8 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -167,15 +168,14 @@ entry:
 define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_v16i8_none:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_v16i8_none:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprd f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P9-NEXT:    blr
@@ -183,13 +183,13 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-BE-P8-LABEL: test_v16i8_none:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_v16i8_none:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P9-NEXT:    blr
@@ -197,13 +197,13 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-64-P8-LABEL: test_v16i8_none:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_v16i8_none:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -212,7 +212,8 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    stb r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -221,7 +222,8 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -377,15 +379,14 @@ entry:
 define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_v8i16_none:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_v8i16_none:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprd f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P9-NEXT:    blr
@@ -393,13 +394,13 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-BE-P8-LABEL: test_v8i16_none:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_v8i16_none:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P9-NEXT:    blr
@@ -407,13 +408,13 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-64-P8-LABEL: test_v8i16_none:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_v8i16_none:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -422,7 +423,8 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    sth r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -431,7 +433,8 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -445,15 +448,14 @@ entry:
 define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_none_v8i16:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_none_v8i16:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprd f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P9-NEXT:    blr
@@ -461,13 +463,13 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-BE-P8-LABEL: test_none_v8i16:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_none_v8i16:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P9-NEXT:    blr
@@ -475,13 +477,13 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-64-P8-LABEL: test_none_v8i16:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_none_v8i16:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -490,7 +492,8 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    sth r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -499,7 +502,8 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    sth r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -655,15 +659,14 @@ entry:
 define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_none_v4i32:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprwz f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_none_v4i32:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P9-NEXT:    blr
@@ -671,13 +674,13 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-BE-P8-LABEL: test_none_v4i32:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_none_v4i32:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-BE-P9-NEXT:    blr
@@ -685,13 +688,13 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-64-P8-LABEL: test_none_v4i32:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_none_v4i32:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, v2, vs0, 1
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -700,7 +703,8 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -709,7 +713,8 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -723,15 +728,14 @@ entry:
 define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b) {
 ; CHECK-LE-P8-LABEL: test_v4i32_none:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-LE-P8-NEXT:    mtfprwz f0, r3
+; CHECK-LE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_v4i32_none:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-LE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, v2, vs0, 2
 ; CHECK-LE-P9-NEXT:    blr
@@ -739,13 +743,13 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-BE-P8-LABEL: test_v4i32_none:
 ; CHECK-BE-P8:       # %bb.0: # %entry
 ; CHECK-BE-P8-NEXT:    mtfprwz f0, r3
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_v4i32_none:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-BE-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-BE-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-BE-P9-NEXT:    blr
@@ -753,13 +757,13 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-64-P8-LABEL: test_v4i32_none:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
 ; CHECK-AIX-64-P8-NEXT:    mtfprwz f0, r3
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_v4i32_none:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT:    lxvdsx v2, 0, r4
 ; CHECK-AIX-64-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, vs0, v2, 2
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -768,7 +772,8 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r4)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P8-NEXT:    blr
@@ -777,7 +782,8 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b)
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    stw r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    lfd f1, 0(r4)
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs1, vs0, 1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:
@@ -934,12 +940,12 @@ define <2 x i64> @test_v2i64_v16i8(i8 %arg1, i64 %arg) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v16i8:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r5, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -48(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs2, -48(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r5
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f2, r4
 ; CHECK-AIX-32-P9-NEXT:    stb r3, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs2, vs2, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw vs1, vs2, vs1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
@@ -955,43 +961,48 @@ entry:
 define <2 x i64> @test_none_v2i64(ptr nocapture noundef readonly %b, i64 %arg) {
 ; CHECK-LE-P8-LABEL: test_none_v2i64:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r3
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
+; CHECK-LE-P8-NEXT:    lfd f0, 0(r3)
+; CHECK-LE-P8-NEXT:    xxspltd v2, vs0, 0
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r4
 ; CHECK-LE-P8-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_none_v2i64:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r3)
+; CHECK-LE-P9-NEXT:    lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT:    xxspltd v2, vs0, 0
 ; CHECK-LE-P9-NEXT:    mtfprd f0, r4
 ; CHECK-LE-P9-NEXT:    xxpermdi v2, vs0, v2, 1
 ; CHECK-LE-P9-NEXT:    blr
 ;
 ; CHECK-BE-P8-LABEL: test_none_v2i64:
 ; CHECK-BE-P8:       # %bb.0: # %entry
-; CHECK-BE-P8-NEXT:    lxvd2x v2, 0, r3
+; CHECK-BE-P8-NEXT:    lfd f0, 0(r3)
+; CHECK-BE-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-BE-P8-NEXT:    mtfprd f0, r4
 ; CHECK-BE-P8-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_none_v2i64:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r3)
+; CHECK-BE-P9-NEXT:    lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-BE-P9-NEXT:    mtfprd f0, r4
 ; CHECK-BE-P9-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-BE-P9-NEXT:    blr
 ;
 ; CHECK-AIX-64-P8-LABEL: test_none_v2i64:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT:    lfd f0, 0(r3)
+; CHECK-AIX-64-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-64-P8-NEXT:    mtfprd f0, r4
 ; CHECK-AIX-64-P8-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_none_v2i64:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT:    lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-64-P9-NEXT:    mtfprd f0, r4
 ; CHECK-AIX-64-P9-NEXT:    xxmrghd v2, v2, vs0
 ; CHECK-AIX-64-P9-NEXT:    blr
@@ -1031,55 +1042,53 @@ entry:
 define <2 x i64> @test_v2i64_none(ptr nocapture noundef readonly %b, i64 %arg) {
 ; CHECK-LE-P8-LABEL: test_v2i64_none:
 ; CHECK-LE-P8:       # %bb.0: # %entry
-; CHECK-LE-P8-NEXT:    lxvd2x vs0, 0, r3
-; CHECK-LE-P8-NEXT:    xxswapd v2, vs0
+; CHECK-LE-P8-NEXT:    ld r3, 0(r3)
 ; CHECK-LE-P8-NEXT:    mtfprd f0, r4
-; CHECK-LE-P8-NEXT:    xxpermdi v2, v2, vs0, 2
+; CHECK-LE-P8-NEXT:    mtfprd f1, r3
+; CHECK-LE-P8-NEXT:    xxmrghd v2, vs1, vs0
 ; CHECK-LE-P8-NEXT:    blr
 ;
 ; CHECK-LE-P9-LABEL: test_v2i64_none:
 ; CHECK-LE-P9:       # %bb.0: # %entry
-; CHECK-LE-P9-NEXT:    lxv v2, 0(r3)
-; CHECK-LE-P9-NEXT:    mtfprd f0, r4
-; CHECK-LE-P9-NEXT:    xxpermdi v2, v2, vs0, 2
+; CHECK-LE-P9-NEXT:    ld r3, 0(r3)
+; CHECK-LE-P9-NEXT:    mtvsrdd v2, r3, r4
 ; CHECK-LE-P9-NEXT:    blr
 ;
 ; CHECK-BE-P8-LABEL: test_v2i64_none:
 ; CHECK-BE-P8:       # %bb.0: # %entry
+; CHECK-BE-P8-NEXT:    ld r3, 0(r3)
 ; CHECK-BE-P8-NEXT:    mtfprd f0, r4
-; CHECK-BE-P8-NEXT:    lxvd2x v3, 0, r3
-; CHECK-BE-P8-NEXT:    xxspltd v2, vs0, 0
-; CHECK-BE-P8-NEXT:    xxmrghd v2, v2, v3
+; CHECK-BE-P8-NEXT:    mtfprd f1, r3
+; CHECK-BE-P8-NEXT:    xxmrghd v2, vs0, vs1
 ; CHECK-BE-P8-NEXT:    blr
 ;
 ; CHECK-BE-P9-LABEL: test_v2i64_none:
 ; CHECK-BE-P9:       # %bb.0: # %entry
-; CHECK-BE-P9-NEXT:    lxv v2, 0(r3)
-; CHECK-BE-P9-NEXT:    mtvsrdd v3, r4, r4
-; CHECK-BE-P9-NEXT:    xxmrghd v2, v3, v2
+; CHECK-BE-P9-NEXT:    ld r3, 0(r3)
+; CHECK-BE-P9-NEXT:    mtvsrdd v2, r4, r3
 ; CHECK-BE-P9-NEXT:    blr
 ;
 ; CHECK-AIX-64-P8-LABEL: test_v2i64_none:
 ; CHECK-AIX-64-P8:       # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT:    ld r3, 0(r3)
 ; CHECK-AIX-64-P8-NEXT:    mtfprd f0, r4
-; CHECK-AIX-64-P8-NEXT:    lxvd2x v3, 0, r3
-; CHECK-AIX-64-P8-NEXT:    xxmrghd v2, vs0, vs0
-; CHECK-AIX-64-P8-NEXT:    xxmrghd v2, v2, v3
+; CHECK-AIX-64-P8-NEXT:    mtfprd f1, r3
+; CHECK-AIX-64-P8-NEXT:    xxmrghd v2, vs0, vs1
 ; CHECK-AIX-64-P8-NEXT:    blr
 ;
 ; CHECK-AIX-64-P9-LABEL: test_v2i64_none:
 ; CHECK-AIX-64-P9:       # %bb.0: # %entry
-; CHECK-AIX-64-P9-NEXT:    lxv v2, 0(r3)
-; CHECK-AIX-64-P9-NEXT:    mtvsrdd v3, r4, r4
-; CHECK-AIX-64-P9-NEXT:    xxmrghd v2, v3, v2
+; CHECK-AIX-64-P9-NEXT:    ld r3, 0(r3)
+; CHECK-AIX-64-P9-NEXT:    mtvsrdd v2, r4, r3
 ; CHECK-AIX-64-P9-NEXT:    blr
 ;
 ; CHECK-AIX-32-P8-LABEL: test_v2i64_none:
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
-; CHECK-AIX-32-P8-NEXT:    lxvd2x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT:    lfd f0, 0(r3)
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -16
 ; CHECK-AIX-32-P8-NEXT:    stw r5, -16(r1)
 ; CHECK-AIX-32-P8-NEXT:    stw r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT:    xxpermdi v2, vs0, vs0, 1
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    addi r3, r1, -32
 ; CHECK-AIX-32-P8-NEXT:    lxvw4x vs1, 0, r3
@@ -1089,11 +1098,12 @@ define <2 x i64> @test_v2i64_none(ptr nocapture noundef readonly %b, i64 %arg) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_none:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    lxv v2, 0(r3)
-; CHECK-AIX-32-P9-NEXT:    stw r5, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs0, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lfd f0, 0(r3)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r4
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
+; CHECK-AIX-32-P9-NEXT:    xxpermdi v2, vs0, vs0, 1
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r5
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs0, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw vs0, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs0, v2
 ; CHECK-AIX-32-P9-NEXT:    blr
@@ -1536,13 +1546,13 @@ define <2 x i64> @test_v4i32_v2i64(i32 %arg1, i64 %arg) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r4, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -48(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r3
 ; CHECK-AIX-32-P9-NEXT:    lwz r3, L..C0(r2) # %const.0
-; CHECK-AIX-32-P9-NEXT:    lxv vs0, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -48(r1)
 ; CHECK-AIX-32-P9-NEXT:    stw r5, -16(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv v2, -16(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs0, vs0, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw v3, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, 0(r3)
 ; CHECK-AIX-32-P9-NEXT:    xxperm v2, v3, vs0
@@ -1713,12 +1723,12 @@ define <2 x i64> @test_v2i64_v4i32(i64 %arg1, i32 %arg) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r4, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs2, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r4
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f2, r3
 ; CHECK-AIX-32-P9-NEXT:    stw r5, -48(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -48(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs2, vs2, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw vs1, vs2, vs1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
@@ -1793,12 +1803,12 @@ define <2 x i64> @test_v2i64_v8i16(i64 %arg1, i16 %arg) {
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
-; CHECK-AIX-32-P9-NEXT:    stw r4, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    stw r3, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs1, -16(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv vs2, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f1, r4
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f2, r3
 ; CHECK-AIX-32-P9-NEXT:    sth r5, -48(r1)
 ; CHECK-AIX-32-P9-NEXT:    lxv vs0, -48(r1)
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs1, vs1, 0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw vs2, vs2, 0
 ; CHECK-AIX-32-P9-NEXT:    xxmrghw vs1, vs2, vs1
 ; CHECK-AIX-32-P9-NEXT:    xxmrghd v2, vs1, vs0
 ; CHECK-AIX-32-P9-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
index 47ffdb4625ed39..a4aa8eac2033dc 100644
--- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
@@ -339,9 +339,11 @@ define void @test_none_v4i32(ptr %ptr, ptr %ptr2, i8 %v3) local_unnamed_addr #0
 ; CHECK-AIX-32-P9-LABEL: test_none_v4i32:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    lxsiwzx v2, 0, r3
-; CHECK-AIX-32-P9-NEXT:    lwz r3, L..C2(r2) # %const.0
 ; CHECK-AIX-32-P9-NEXT:    stb r5, -32(r1)
-; CHECK-AIX-32-P9-NEXT:    lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    lwz r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT:    mtfprwz f0, r3
+; CHECK-AIX-32-P9-NEXT:    lwz r3, L..C2(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT:    xxinsertw v3, vs0, 0
 ; CHECK-AIX-32-P9-NEXT:    lxv v4, 0(r3)
 ; CHECK-AIX-32-P9-NEXT:    vmrghh v3, v3, v3
 ; CHECK-AIX-32-P9-NEXT:    vperm v2, v2, v3, v4
diff --git a/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll b/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll
index 66c1b6f6d26daa..cc2fe5604a3714 100644
--- a/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll
@@ -45,18 +45,16 @@ define <2 x double> @test01(ptr %p1, ptr %p2) {
 define <2 x double> @test02(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test02:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxmrgld 34, 1, 0
+; CHECK-NEXT:    lfd 0, 0(3)
+; CHECK-NEXT:    lfd 1, 0(4)
+; CHECK-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test02:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxmrgld 34, 1, 0
+; CHECK-P9-NEXT:    lfd 0, 0(3)
+; CHECK-P9-NEXT:    lfd 1, 0(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -67,18 +65,16 @@ define <2 x double> @test02(ptr %p1, ptr %p2) {
 define <2 x double> @test03(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test03:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxpermdi 34, 1, 0, 1
+; CHECK-NEXT:    lfd 0, 0(3)
+; CHECK-NEXT:    lfd 1, 8(4)
+; CHECK-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test03:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxpermdi 34, 1, 0, 1
+; CHECK-P9-NEXT:    lfd 0, 0(3)
+; CHECK-P9-NEXT:    lfd 1, 8(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -123,18 +119,16 @@ define <2 x double> @test11(ptr %p1, ptr %p2) {
 define <2 x double> @test12(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test12:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxpermdi 34, 1, 0, 2
+; CHECK-NEXT:    lfd 0, 8(3)
+; CHECK-NEXT:    lfd 1, 0(4)
+; CHECK-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test12:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxpermdi 34, 1, 0, 2
+; CHECK-P9-NEXT:    lfd 0, 8(3)
+; CHECK-P9-NEXT:    lfd 1, 0(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -145,17 +139,15 @@ define <2 x double> @test12(ptr %p1, ptr %p2) {
 define <2 x double> @test13(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test13:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
+; CHECK-NEXT:    lfd 0, 8(3)
+; CHECK-NEXT:    lfd 1, 8(4)
 ; CHECK-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test13:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
+; CHECK-P9-NEXT:    lfd 0, 8(3)
+; CHECK-P9-NEXT:    lfd 1, 8(4)
 ; CHECK-P9-NEXT:    xxmrghd 34, 1, 0
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
@@ -167,18 +159,16 @@ define <2 x double> @test13(ptr %p1, ptr %p2) {
 define <2 x double> @test20(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test20:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxmrgld 34, 0, 1
+; CHECK-NEXT:    lfd 0, 0(3)
+; CHECK-NEXT:    lfd 1, 0(4)
+; CHECK-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test20:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxmrgld 34, 0, 1
+; CHECK-P9-NEXT:    lfd 0, 0(3)
+; CHECK-P9-NEXT:    lfd 1, 0(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -189,18 +179,16 @@ define <2 x double> @test20(ptr %p1, ptr %p2) {
 define <2 x double> @test21(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test21:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxpermdi 34, 0, 1, 1
+; CHECK-NEXT:    lfd 0, 8(3)
+; CHECK-NEXT:    lfd 1, 0(4)
+; CHECK-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test21:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxpermdi 34, 0, 1, 1
+; CHECK-P9-NEXT:    lfd 0, 8(3)
+; CHECK-P9-NEXT:    lfd 1, 0(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -244,18 +232,16 @@ define <2 x double> @test23(ptr %p1, ptr %p2) {
 define <2 x double> @test30(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test30:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
-; CHECK-NEXT:    xxpermdi 34, 0, 1, 2
+; CHECK-NEXT:    lfd 0, 0(3)
+; CHECK-NEXT:    lfd 1, 8(4)
+; CHECK-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test30:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
-; CHECK-P9-NEXT:    xxpermdi 34, 0, 1, 2
+; CHECK-P9-NEXT:    lfd 0, 0(3)
+; CHECK-P9-NEXT:    lfd 1, 8(4)
+; CHECK-P9-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
   %v2 = load <2 x double>, ptr %p2
@@ -266,17 +252,15 @@ define <2 x double> @test30(ptr %p1, ptr %p2) {
 define <2 x double> @test31(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: test31:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    lxvd2x 1, 0, 4
-; CHECK-NEXT:    xxswapd 0, 0
-; CHECK-NEXT:    xxswapd 1, 1
+; CHECK-NEXT:    lfd 0, 8(3)
+; CHECK-NEXT:    lfd 1, 8(4)
 ; CHECK-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test31:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    lxv 0, 0(3)
-; CHECK-P9-NEXT:    lxv 1, 0(4)
+; CHECK-P9-NEXT:    lfd 0, 8(3)
+; CHECK-P9-NEXT:    lfd 1, 8(4)
 ; CHECK-P9-NEXT:    xxmrghd 34, 0, 1
 ; CHECK-P9-NEXT:    blr
   %v1 = load <2 x double>, ptr %p1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 41d8abb9b73ebc..b1735ec832d9bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
 
 define <4 x bfloat> @shuffle_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; CHECK-LABEL: shuffle_v4bf16:
@@ -385,12 +385,19 @@ define <4 x half> @vrgather_shuffle_vv_v4f16(<4 x half> %x, <4 x half> %y) {
 }
 
 define <4 x half> @vrgather_shuffle_vx_v4f16_load(ptr %p) {
-; CHECK-LABEL: vrgather_shuffle_vx_v4f16_load:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lh a0, 2(a0)
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
+; ZVFH-LABEL: vrgather_shuffle_vx_v4f16_load:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    flh fa5, 2(a0)
+; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT:    vfmv.v.f v8, fa5
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: vrgather_shuffle_vx_v4f16_load:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    lh a0, 2(a0)
+; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
+; ZVFHMIN-NEXT:    ret
   %v = load <4 x half>, ptr %p
   %s = shufflevector <4 x half> %v, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x half> %s
diff --git a/llvm/test/CodeGen/Thumb2/mve-extractstore.ll b/llvm/test/CodeGen/Thumb2/mve-extractstore.ll
index 941ae78cc9a79a..18ff9034a45303 100644
--- a/llvm/test/CodeGen/Thumb2/mve-extractstore.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-extractstore.ll
@@ -5,10 +5,9 @@ define half @extret1_f16_sf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
 ; CHECK-LABEL: extret1_f16_sf:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    vmov d0, r0, r1
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    ldrh.w r0, [sp, #2]
+; CHECK-NEXT:    vadd.f16 q0, q0, r0
 ; CHECK-NEXT:    ldr r0, [sp, #16]
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vmovx.f16 s0, s0
 ; CHECK-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NEXT:    vmov r0, s0
@@ -22,11 +21,10 @@ define half @extret1_f16_sf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
 define half @extret4_f16_sf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
 ; CHECK-LABEL: extret4_f16_sf:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    ldrh.w r0, [sp, #8]
 ; CHECK-NEXT:    vmov d1, r2, r3
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vadd.f16 q0, q0, r0
 ; CHECK-NEXT:    ldr r0, [sp, #16]
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstr.16 s2, [r0]
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    bx lr
@@ -98,11 +96,10 @@ define arm_aapcs_vfpcc <8 x half> @extret4_v8f16_hf(<8 x half> %a, <8 x half> %b
 define float @extret1_f32_sf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
 ; CHECK-LABEL: extret1_f32_sf:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov d0, r0, r1
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldr s1, [sp, #4]
+; CHECK-NEXT:    vmov d2, r0, r1
 ; CHECK-NEXT:    ldr r1, [sp, #16]
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vadd.f32 q0, q1, q0
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vstr s1, [r1]
 ; CHECK-NEXT:    bx lr
@@ -115,11 +112,10 @@ define float @extret1_f32_sf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
 define float @extret2_f32_sf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
 ; CHECK-LABEL: extret2_f32_sf:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    mov r0, sp
-; CHECK-NEXT:    vmov d1, r2, r3
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldr s2, [sp, #8]
+; CHECK-NEXT:    vmov d3, r2, r3
 ; CHECK-NEXT:    ldr r1, [sp, #16]
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vadd.f32 q0, q1, q0
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vstr s2, [r1]
 ; CHECK-NEXT:    bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
index 5f56a82f3c5110..78c754f712bfab 100644
--- a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
@@ -128,9 +128,11 @@ define <8 x i16> @inserti8_last_zext(ptr %p) {
 define <8 x i32> @inserti32_first(ptr %p) {
 ; CHECKLE-LABEL: inserti32_first:
 ; CHECKLE:       @ %bb.0:
+; CHECKLE-NEXT:    ldr r1, [r0, #16]
 ; CHECKLE-NEXT:    vldrw.u32 q2, [r0, #20]
-; CHECKLE-NEXT:    vldr s4, [r0, #16]
 ; CHECKLE-NEXT:    vldrw.u32 q0, [r0]
+; CHECKLE-NEXT:    vmov.32 q1[3], r1
+; CHECKLE-NEXT:    vmov.f32 s4, s7
 ; CHECKLE-NEXT:    vmov.f32 s5, s8
 ; CHECKLE-NEXT:    vmov.f32 s6, s9
 ; CHECKLE-NEXT:    vmov.f32 s7, s10
@@ -138,14 +140,16 @@ define <8 x i32> @inserti32_first(ptr %p) {
 ;
 ; CHECKBE-LABEL: inserti32_first:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrw.u32 q3, [r0, #20]
 ; CHECKBE-NEXT:    vldrb.u8 q1, [r0]
-; CHECKBE-NEXT:    vldr s8, [r0, #16]
-; CHECKBE-NEXT:    vmov.f32 s9, s12
+; CHECKBE-NEXT:    ldr r1, [r0, #16]
+; CHECKBE-NEXT:    vldrw.u32 q2, [r0, #20]
 ; CHECKBE-NEXT:    vrev64.8 q0, q1
-; CHECKBE-NEXT:    vmov.f32 s10, s13
-; CHECKBE-NEXT:    vmov.f32 s11, s14
-; CHECKBE-NEXT:    vrev64.32 q1, q2
+; CHECKBE-NEXT:    vmov.32 q1[3], r1
+; CHECKBE-NEXT:    vmov.f32 s12, s7
+; CHECKBE-NEXT:    vmov.f32 s13, s8
+; CHECKBE-NEXT:    vmov.f32 s14, s9
+; CHECKBE-NEXT:    vmov.f32 s15, s10
+; CHECKBE-NEXT:    vrev64.32 q1, q3
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 4
   %l1 = load <8 x i32>, ptr %q
@@ -158,24 +162,28 @@ define <8 x i32> @inserti32_first(ptr %p) {
 define <8 x i32> @inserti32_last(ptr %p) {
 ; CHECKLE-LABEL: inserti32_last:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrw.u32 q2, [r0]
-; CHECKLE-NEXT:    vldr s3, [r0, #16]
+; CHECKLE-NEXT:    ldr r1, [r0, #16]
+; CHECKLE-NEXT:    vldrw.u32 q0, [r0]
 ; CHECKLE-NEXT:    vldrw.u32 q1, [r0, #20]
-; CHECKLE-NEXT:    vmov.f32 s0, s9
-; CHECKLE-NEXT:    vmov.f32 s1, s10
-; CHECKLE-NEXT:    vmov.f32 s2, s11
+; CHECKLE-NEXT:    vmov.32 q2[0], r1
+; CHECKLE-NEXT:    vmov.f32 s0, s1
+; CHECKLE-NEXT:    vmov.f32 s1, s2
+; CHECKLE-NEXT:    vmov.f32 s2, s3
+; CHECKLE-NEXT:    vmov.f32 s3, s8
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: inserti32_last:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrw.u32 q3, [r0]
-; CHECKBE-NEXT:    vldrb.u8 q0, [r0, #20]
-; CHECKBE-NEXT:    vldr s11, [r0, #16]
-; CHECKBE-NEXT:    vmov.f32 s8, s13
-; CHECKBE-NEXT:    vrev64.8 q1, q0
-; CHECKBE-NEXT:    vmov.f32 s9, s14
-; CHECKBE-NEXT:    vmov.f32 s10, s15
-; CHECKBE-NEXT:    vrev64.32 q0, q2
+; CHECKBE-NEXT:    ldr r1, [r0, #16]
+; CHECKBE-NEXT:    vldrw.u32 q0, [r0]
+; CHECKBE-NEXT:    vldrb.u8 q2, [r0, #20]
+; CHECKBE-NEXT:    vmov.32 q1[0], r1
+; CHECKBE-NEXT:    vmov.f32 s12, s1
+; CHECKBE-NEXT:    vmov.f32 s15, s4
+; CHECKBE-NEXT:    vrev64.8 q1, q2
+; CHECKBE-NEXT:    vmov.f32 s13, s2
+; CHECKBE-NEXT:    vmov.f32 s14, s3
+; CHECKBE-NEXT:    vrev64.32 q0, q3
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 32
   %l1 = load <8 x i32>, ptr %p
diff --git a/llvm/test/CodeGen/X86/SwizzleShuff.ll b/llvm/test/CodeGen/X86/SwizzleShuff.ll
index 0cfafdd86863e5..9f3dffd75cfa87 100644
--- a/llvm/test/CodeGen/X86/SwizzleShuff.ll
+++ b/llvm/test/CodeGen/X86/SwizzleShuff.ll
@@ -19,7 +19,7 @@ define void @pull_bitcast(ptr %pA, ptr %pB) {
 define <4 x i32> @multi_use_swizzle(ptr %pA, ptr %pB) {
 ; CHECK-LABEL: multi_use_swizzle:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %xmm0
+; CHECK-NEXT:    vbroadcastss 4(%rdi), %xmm0
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,2]
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[1,3,2,2]
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,0,2]
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
index 0bfd8921e8b42a..e5eb366fe22e30 100644
--- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -370,12 +370,12 @@ define <4 x i32> @load_splat_4i32_4i32_1111(ptr %ptr) nounwind uwtable readnone
 ; X86-LABEL: load_splat_4i32_4i32_1111:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
+; X86-NEXT:    vbroadcastss 4(%eax), %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: load_splat_4i32_4i32_1111:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
+; X64-NEXT:    vbroadcastss 4(%rdi), %xmm0
 ; X64-NEXT:    retq
 entry:
   %ld = load <4 x i32>, ptr %ptr
@@ -477,7 +477,7 @@ define <2 x i64> @load_splat_2i64_2i64_1111(ptr %ptr) nounwind uwtable readnone
 ;
 ; X64-LABEL: load_splat_2i64_2i64_1111:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
+; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; X64-NEXT:    retq
 entry:
   %ld = load <2 x i64>, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index 4ce092c099b088..3b52773db86739 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -81,13 +81,15 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    vinsertps $0, 12(%eax,%ecx), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
+; X86-NEXT:    vmovss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    shlq $4, %rsi
-; X64-NEXT:    vinsertps $0, 12(%rdi,%rsi), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
+; X64-NEXT:    vmovss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, ptr %pb, i64 %index
   %2 = load <4 x float>, ptr %1, align 16
diff --git a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll
index e971d1e471bf7a..49cc58c8e73a8a 100644
--- a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll
+++ b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll
@@ -9,14 +9,16 @@ define void @test1(ptr %A, ptr %C) #0 {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X86-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test1:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
 ; X64-NEXT:    retq
   %tmp2 = load <8 x float>, ptr %A, align 32
@@ -34,15 +36,15 @@ define void @test2(ptr %A, ptr %C) #0 {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %xmm0
-; X86-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = [2147483647,0,0,0]
+; X86-NEXT:    vorps (%ecx), %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test2:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %xmm0
-; X64-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = [2147483647,0,0,0]
+; X64-NEXT:    vorps (%rdi), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
 ; X64-NEXT:    retq
   %tmp2 = load <8 x float>, ptr %A, align 32
@@ -60,15 +62,15 @@ define void @test3(ptr %A, ptr %C) #0 {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %xmm0
-; X86-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = [2147483647,0,0,0]
+; X86-NEXT:    vxorps (%ecx), %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test3:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %xmm0
-; X64-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = [2147483647,0,0,0]
+; X64-NEXT:    vxorps (%rdi), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
 ; X64-NEXT:    retq
   %tmp2 = load <8 x float>, ptr %A, align 32
@@ -86,14 +88,16 @@ define void @test4(ptr %A, ptr %C) #0 {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovaps (%ecx), %xmm0
-; X86-NEXT:    vandnps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = [2147483647,0,0,0]
+; X86-NEXT:    vandnps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test4:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    vmovaps (%rdi), %xmm0
-; X64-NEXT:    vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vmovss {{.*#+}} xmm1 = [2147483647,0,0,0]
+; X64-NEXT:    vandnps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
 ; X64-NEXT:    retq
   %tmp2 = load <8 x float>, ptr %A, align 32
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index 0fae921b1ca83d..20550fc4eb9fa6 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -1158,7 +1158,7 @@ define <16 x i32> @masked_inc_test(<16 x i32> %i, <16 x i32> %mask1) nounwind re
 ; CHECK-LABEL: masked_inc_test:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
 ; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -1171,7 +1171,7 @@ define <16 x i32> @masked_dec_test(<16 x i32> %i, <16 x i32> %mask1) nounwind re
 ; CHECK-LABEL: masked_dec_test:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
index 7e48b3719cf0ff..5d7c337c94dee2 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
@@ -18,18 +18,18 @@ define <64 x i8> @add_v64i8_broadcasts(<64 x i8> %a0, i64 %a1, i8 %a2) {
 ; AVX512F-NEXT:    kmovw %ecx, %k2
 ; AVX512F-NEXT:    kmovw %eax, %k3
 ; AVX512F-NEXT:    kmovw %edi, %k4
-; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k4} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k4} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
 ; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 {%k2} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
 ; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT:    vpternlogq $216, %zmm2, %zmm1, %zmm0
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
@@ -37,7 +37,7 @@ define <64 x i8> @add_v64i8_broadcasts(<64 x i8> %a0, i64 %a1, i8 %a2) {
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogq $226, %zmm4, %zmm2, %zmm0
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm2 & (zmm0 ^ zmm4))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: add_v64i8_broadcasts:
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index c27cced9d5ffa7..23b46ee59154fb 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -166,7 +166,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
 ; KNL-NEXT:    pushq %rax
 ; KNL-NEXT:    .cfi_def_cfa_offset 16
 ; KNL-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    callq _func16xi1
 ; KNL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -194,7 +194,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
 ; KNL_X32-NEXT:    subl $12, %esp
 ; KNL_X32-NEXT:    .cfi_def_cfa_offset 16
 ; KNL_X32-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
-; KNL_X32-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL_X32-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL_X32-NEXT:    calll _func16xi1
 ; KNL_X32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll
index e46378804be5e5..56d6d136fdb614 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp.ll
@@ -190,7 +190,7 @@ define <8 x i32> @legalize_loop(<8 x double> %arg) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; KNL-NEXT:    vcmpnltpd %zmm0, %zmm1, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[3,2,1,0,7,6,5,4]
 ; KNL-NEXT:    vpsrld $31, %ymm1, %ymm1
 ; KNL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 79e59fdcf4a125..d19eaf4459fee5 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -1603,7 +1603,7 @@ define   <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
 ; KNL-LABEL: zext_16i1_to_16xi32:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vpsrld $31, %zmm0, %zmm0
 ; KNL-NEXT:    retq
 ;
@@ -1629,7 +1629,7 @@ define   <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
 ; KNL-LABEL: zext_8i1_to_8xi64:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    vpsrlq $63, %zmm0, %zmm0
 ; KNL-NEXT:    retq
 ;
@@ -1747,14 +1747,14 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 ; KNL-LABEL: sext_8i1_8i32:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
-; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; KNL-NEXT:    vpternlogq {{.*#+}} zmm0 = ~zmm0
 ; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; KNL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: sext_8i1_8i32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vpternlogq $15, %ymm0, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ~ymm0
 ; AVX512DQ-NEXT:    retq
   %x = icmp slt <8 x i32> %a1, %a2
   %x1 = xor <8 x i1>%x, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
@@ -1840,7 +1840,7 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
 ; KNL-LABEL: sext_16i1_16i32:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; KNL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: sext_16i1_16i32:
@@ -2313,12 +2313,12 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
 ; KNL-NEXT:    kmovw %eax, %k2
 ; KNL-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL-NEXT:    korw %k2, %k0, %k2
-; KNL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; KNL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1
 ; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; KNL-NEXT:    vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1
 ; KNL-NEXT:    vpmovdw %zmm2, %ymm2
 ; KNL-NEXT:    vpmovdw %zmm3, %ymm3
 ; KNL-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
index 6c661eb771d1bd..5115c3cdc259a7 100644
--- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -485,7 +485,7 @@ define void @load_v64i1_broadcast_32_v16i1(ptr %a0,<16 x float> %a1,<16 x float>
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
-; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %zmm2
 ; AVX512NOTDQ-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; AVX512NOTDQ-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
@@ -642,7 +642,7 @@ define void @load_v64i1_broadcast_63_v16i1(ptr %a0,<16 x float> %a1,<16 x float>
 ; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1:
 ; AVX512NOTDQ-FAST:       # %bb.0:
 ; AVX512NOTDQ-FAST-NEXT:    kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-FAST-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512NOTDQ-FAST-NEXT:    vpermd %zmm2, %zmm3, %zmm2
 ; AVX512NOTDQ-FAST-NEXT:    vptestmd %zmm2, %zmm2, %k1
@@ -654,7 +654,7 @@ define void @load_v64i1_broadcast_63_v16i1(ptr %a0,<16 x float> %a1,<16 x float>
 ; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1:
 ; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7]
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %zmm2, %zmm2, %k1
@@ -1426,7 +1426,7 @@ define void @load_v64i1_broadcast_32_v16i1_store(ptr %a0,ptr %a1) {
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
 ; AVX512NOTDQ:       # %bb.0:
 ; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
-; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; AVX512NOTDQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512NOTDQ-NEXT:    kmovw %k0, (%rsi)
@@ -1596,7 +1596,7 @@ define void @load_v64i1_broadcast_63_v16i1_store(ptr %a0,ptr %a1) {
 ; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store:
 ; AVX512NOTDQ-FAST:       # %bb.0:
 ; AVX512NOTDQ-FAST-NEXT:    kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-FAST-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512NOTDQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
 ; AVX512NOTDQ-FAST-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1607,7 +1607,7 @@ define void @load_v64i1_broadcast_63_v16i1_store(ptr %a0,ptr %a1) {
 ; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store:
 ; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
 ; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %zmm0, %zmm0, %k0
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index a8574c0b7516c1..80b4ae49420973 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -5824,8 +5824,10 @@ define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %
 ;
 ; X64-LABEL: test_mm_mask_fmsub_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm2, %xmm2
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <2 x double> %__W, i64 0
@@ -5876,8 +5878,10 @@ define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double>
 ;
 ; X64-LABEL: test_mm_maskz_fmsub_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm2, %xmm2
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <2 x double> %__A, i64 0
@@ -5932,8 +5936,11 @@ define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double
 ;
 ; X64-LABEL: test_mm_mask3_fmsub_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm2, %xmm3
+; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm3, %xmm0, %xmm1
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; X64-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
 ; X64-NEXT:    vmovapd %xmm2, %xmm0
 ; X64-NEXT:    retq
 entry:
@@ -5986,8 +5993,10 @@ define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext
 ;
 ; X64-LABEL: test_mm_mask_fnmadd_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <2 x double> %__W, i64 0
@@ -6038,8 +6047,10 @@ define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double>
 ;
 ; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
 entry:
   %0 = extractelement <2 x double> %__A, i64 0
@@ -6094,8 +6105,10 @@ define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x doubl
 ;
 ; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; X64-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; X64-NEXT:    vmovapd %xmm2, %xmm0
 ; X64-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/avx512-load-store.ll b/llvm/test/CodeGen/X86/avx512-load-store.ll
index c32c3d9b855039..ce6bfa90d88a79 100644
--- a/llvm/test/CodeGen/X86/avx512-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx512-load-store.ll
@@ -143,7 +143,7 @@ define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, ptr
 ; CHECK64-LABEL: test_mm_mask_load_ss:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT:    vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_mask_load_ss:
@@ -151,7 +151,7 @@ define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, ptr
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1}
+; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
   %shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
@@ -168,7 +168,7 @@ define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, pt
 ; CHECK64-LABEL: test_mm_mask_load_sd:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_mask_load_sd:
@@ -176,7 +176,7 @@ define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, pt
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1}
+; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
   %shuffle5.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
@@ -192,7 +192,7 @@ define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, ptr %__W) local_unnam
 ; CHECK64-LABEL: test_mm_maskz_load_ss:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT:    vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_ss:
@@ -200,7 +200,7 @@ define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, ptr %__W) local_unnam
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
@@ -215,7 +215,7 @@ define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, ptr %__W) local_unna
 ; CHECK64-LABEL: test_mm_maskz_load_sd:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_sd:
@@ -223,7 +223,7 @@ define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, ptr %__W) local_unna
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
@@ -283,7 +283,7 @@ define <4 x float> @test_mm_mask_load_ss_2(<4 x float> %__A, i8 zeroext %__U, pt
 ; CHECK64-LABEL: test_mm_mask_load_ss_2:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT:    vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_mask_load_ss_2:
@@ -291,7 +291,7 @@ define <4 x float> @test_mm_mask_load_ss_2(<4 x float> %__A, i8 zeroext %__U, pt
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1}
+; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
   %shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
@@ -306,7 +306,7 @@ define <4 x float> @test_mm_maskz_load_ss_2(i8 zeroext %__U, ptr readonly %__W)
 ; CHECK64-LABEL: test_mm_maskz_load_ss_2:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT:    vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_ss_2:
@@ -314,7 +314,7 @@ define <4 x float> @test_mm_maskz_load_ss_2(i8 zeroext %__U, ptr readonly %__W)
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
@@ -328,7 +328,7 @@ define <2 x double> @test_mm_mask_load_sd_2(<2 x double> %__A, i8 zeroext %__U,
 ; CHECK64-LABEL: test_mm_mask_load_sd_2:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_mask_load_sd_2:
@@ -336,7 +336,7 @@ define <2 x double> @test_mm_mask_load_sd_2(<2 x double> %__A, i8 zeroext %__U,
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1}
+; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
   %shuffle3.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
@@ -351,7 +351,7 @@ define <2 x double> @test_mm_maskz_load_sd_2(i8 zeroext %__U, ptr readonly %__W)
 ; CHECK64-LABEL: test_mm_maskz_load_sd_2:
 ; CHECK64:       # %bb.0: # %entry
 ; CHECK64-NEXT:    kmovw %edi, %k1
-; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
 ; CHECK64-NEXT:    retq
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_sd_2:
@@ -359,7 +359,7 @@ define <2 x double> @test_mm_maskz_load_sd_2(i8 zeroext %__U, ptr readonly %__W)
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    kmovw %ecx, %k1
-; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll
index e53e194ba05c2a..23f4fcb1c77c64 100644
--- a/llvm/test/CodeGen/X86/avx512-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512-logic.ll
@@ -856,7 +856,7 @@ entry:
 define <16 x i32> @ternlog_and_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) {
 ; ALL-LABEL: ternlog_and_andn:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogd $8, %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm2 & zmm1 & ~zmm0
 ; ALL-NEXT:    retq
   %a = xor <16 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   %b = and <16 x i32> %y, %a
@@ -867,7 +867,7 @@ define <16 x i32> @ternlog_and_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 define <16 x i32> @ternlog_or_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) {
 ; ALL-LABEL: ternlog_or_andn:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogd $206, %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = (zmm1 & ~zmm0) | zmm2
 ; ALL-NEXT:    retq
   %a = xor <16 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   %b = and <16 x i32> %y, %a
@@ -878,7 +878,7 @@ define <16 x i32> @ternlog_or_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 define <16 x i32> @ternlog_xor_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) {
 ; ALL-LABEL: ternlog_xor_andn:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogd $198, %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm2 ^ (zmm1 & ~zmm0)
 ; ALL-NEXT:    retq
   %a = xor <16 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
   %b = and <16 x i32> %y, %a
@@ -889,7 +889,7 @@ define <16 x i32> @ternlog_xor_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) {
 ; ALL-LABEL: ternlog_or_and_mask:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = (zmm0 & mem) | zmm1
 ; ALL-NEXT:    retq
   %a = and <16 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %b = or <16 x i32> %a, %y
@@ -899,7 +899,7 @@ define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) {
 define <8 x i64> @ternlog_xor_and_mask(<8 x i64> %x, <8 x i64> %y) {
 ; ALL-LABEL: ternlog_xor_and_mask:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; ALL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
 ; ALL-NEXT:    retq
   %a = and <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   %b = xor <8 x i64> %a, %y
@@ -911,7 +911,7 @@ define <16 x i32> @ternlog_maskz_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x
 ; ALL:       ## %bb.0:
 ; ALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3
 ; ALL-NEXT:    vpsrad $31, %zmm2, %zmm0
-; ALL-NEXT:    vpternlogd $224, %zmm1, %zmm3, %zmm0
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 & (zmm3 | zmm1)
 ; ALL-NEXT:    retq
   %m = icmp slt <16 x i32> %mask, zeroinitializer
   %a = and <16 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -925,7 +925,7 @@ define <8 x i64> @ternlog_maskz_xor_and_mask(<8 x i64> %x, <8 x i64> %y, <8 x i6
 ; ALL:       ## %bb.0:
 ; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm3
 ; ALL-NEXT:    vpsraq $63, %zmm2, %zmm0
-; ALL-NEXT:    vpternlogq $96, %zmm1, %zmm3, %zmm0
+; ALL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 & (zmm3 ^ zmm1)
 ; ALL-NEXT:    retq
   %m = icmp slt <8 x i64> %mask, zeroinitializer
   %a = and <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index 536c667c7ec902..721ffbe1ceb797 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -502,7 +502,7 @@ define <16 x i64> @narrowExtractedVectorSelect_crash(<16 x i64> %arg, <16 x i16>
 ; X86-AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; X86-AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; X86-AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
-; X86-AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; X86-AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; X86-AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm1
 ; X86-AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
@@ -515,7 +515,7 @@ define <16 x i64> @narrowExtractedVectorSelect_crash(<16 x i64> %arg, <16 x i16>
 ; X64-AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; X64-AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; X64-AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
-; X64-AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; X64-AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; X64-AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm1
 ; X64-AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index aac5847061cbec..c2afa2b971d752 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -227,9 +227,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
-; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    vpbroadcastw 2(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,8,11,8,13,8,15,9]
+; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm3
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
@@ -243,10 +243,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
+; CHECK-NEXT:    vpbroadcastw 2(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [1,8,11,8,13,8,15,9]
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
@@ -259,10 +259,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16
 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vpbroadcastw 18(%rdi), %xmm1
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [1,15,1,14,1,12,11,10]
+; CHECK-NEXT:    vpermi2w (%rdi), %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -271,12 +270,11 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [9,7,9,6,9,4,3,2]
+; CHECK-NEXT:    vpbroadcastw 18(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,15,1,14,1,12,11,10]
+; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm3
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -288,11 +286,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
+; CHECK-NEXT:    vpbroadcastw 18(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [1,15,1,14,1,12,11,10]
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
@@ -778,9 +776,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm3
-; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [23,22,20,22,28,20,11,17]
+; CHECK-NEXT:    vpbroadcastw 22(%rdi), %ymm3
+; CHECK-NEXT:    vpermt2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
@@ -795,10 +793,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [23,22,20,22,28,20,11,17]
+; CHECK-NEXT:    vpbroadcastw 22(%rdi), %ymm1
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1041,8 +1039,9 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i
 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm0
-; CHECK-NEXT:    vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0]
+; CHECK-NEXT:    vmovss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,1],xmm0[0,0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, ptr %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
@@ -1051,8 +1050,9 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0]
+; CHECK-NEXT:    vmovss (%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm3
+; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,1],xmm2[0,0]
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
@@ -1066,8 +1066,9 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
-; CHECK-NEXT:    vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0]
+; CHECK-NEXT:    vmovss (%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
+; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
@@ -1081,12 +1082,11 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [5,0,0,3]
+; CHECK-NEXT:    vpbroadcastd 20(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,4,4,7]
+; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, ptr %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
@@ -1098,11 +1098,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [5,0,0,3]
+; CHECK-NEXT:    vpbroadcastd 20(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,4,4,7]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, ptr %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
@@ -1114,9 +1114,9 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
+; CHECK-NEXT:    vmovd 16(%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,7,7,0]
-; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    vpermi2d 12(%rdi){1to4}, %xmm2, %xmm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
@@ -1130,10 +1130,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
+; CHECK-NEXT:    vmovd 16(%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,7,7,0]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2d 12(%rdi){1to4}, %xmm2, %xmm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, ptr %vp
@@ -1609,9 +1609,10 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32
 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [13,0,0,6]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [5,8,8,14]
+; CHECK-NEXT:    vpbroadcastd 52(%rdi), %ymm0
+; CHECK-NEXT:    vpermt2d (%rdi), %ymm1, %ymm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
@@ -1621,11 +1622,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6]
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [5,8,8,14]
+; CHECK-NEXT:    vpbroadcastd 52(%rdi), %ymm3
+; CHECK-NEXT:    vpermt2d (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
@@ -1638,10 +1639,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6]
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [5,8,8,14]
+; CHECK-NEXT:    vpbroadcastd 52(%rdi), %ymm1
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vpermt2d (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
@@ -1654,9 +1656,9 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [15,5,3,2,0,0,0,0]
-; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    vpbroadcastd 28(%rdi), %ymm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [7,13,11,10,0,0,0,0]
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
@@ -1671,10 +1673,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [15,5,3,2,0,0,0,0]
+; CHECK-NEXT:    vpbroadcastd 28(%rdi), %ymm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [7,13,11,10,0,0,0,0]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -1853,8 +1855,9 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i
 define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %xmm0
-; CHECK-NEXT:    vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1]
+; CHECK-NEXT:    vmovsd 24(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, ptr %vp
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
@@ -1863,9 +1866,10 @@ define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
+; CHECK-NEXT:    vmovq 24(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovq 8(%rdi), %xmm3 # xmm3 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpunpckhqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[1]
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, ptr %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
@@ -1877,9 +1881,10 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %
 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
+; CHECK-NEXT:    vmovq 24(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovq 8(%rdi), %xmm2 # xmm2 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpunpckhqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[1]
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, ptr %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
@@ -1891,10 +1896,10 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
+; CHECK-NEXT:    vmovq 8(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovq 16(%rdi), %xmm3 # xmm3 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, ptr %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
@@ -1906,10 +1911,10 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %
 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm1
-; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
+; CHECK-NEXT:    vmovq 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovq 16(%rdi), %xmm2 # xmm2 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, ptr %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
@@ -2374,17 +2379,17 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,3,2,4]
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vmovq (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,7,6,0]
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $15, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    vmovq (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vpblendd $240, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2398,17 +2403,17 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,3,2,4]
+; CHECK-FAST-NEXT:    vmovq (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,6,0]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $15, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    vmovq (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vpblendd $240, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2422,17 +2427,18 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,5,5,1]
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,1,1,5]
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2446,17 +2452,18 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [3,5,5,1]
+; CHECK-FAST-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,1,1,5]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 8(%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2470,16 +2477,16 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64>
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [7,0,0,2]
-; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vpbroadcastq 56(%rdi), %ymm1
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [2,4,4,6]
+; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
-; CHECK-FAST-PERLANE-NEXT:    vpalignr $8, 32(%rdi), %ymm0, %ymm0 # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3]
+; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd 56(%rdi), %ymm0
+; CHECK-FAST-PERLANE-NEXT:    vunpcklpd (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,1,3]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
@@ -2488,17 +2495,17 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,0,0,2]
+; CHECK-FAST-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [2,4,4,6]
+; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpalignr $8, 32(%rdi), %ymm2, %ymm2 # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2512,16 +2519,17 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,0,0,2]
+; CHECK-FAST-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [2,4,4,6]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vpalignr $8, 32(%rdi), %ymm1, %ymm1 # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 56(%rdi), %ymm1
+; CHECK-FAST-PERLANE-NEXT:    vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2567,17 +2575,17 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,2,7,1]
+; CHECK-FAST-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,6,3,5]
+; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $192, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vpblendd $63, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1,2,3,4,5],ymm2[6,7]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2591,16 +2599,17 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,7,1]
+; CHECK-FAST-NEXT:    vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,6,3,5]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $192, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 56(%rdi), %ymm1
+; CHECK-FAST-PERLANE-NEXT:    vpblendd $63, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2614,9 +2623,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64>
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [7,2,3,2]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    vpbroadcastq 56(%rdi), %xmm1
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [1,6,7,6]
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2625,11 +2634,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,2,3,2]
+; CHECK-NEXT:    vpbroadcastq 56(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,6,7,6]
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2641,10 +2650,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,2,3,2]
+; CHECK-NEXT:    vpbroadcastq 56(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [1,6,7,6]
 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2656,17 +2666,18 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,3,1,5]
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,7,5,1]
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2680,17 +2691,18 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [3,3,1,5]
+; CHECK-FAST-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,7,5,1]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 8(%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
@@ -2712,8 +2724,9 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovaps 32(%rdi), %xmm0
-; CHECK-FAST-PERLANE-NEXT:    vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 32(%rdi), %xmm0
+; CHECK-FAST-PERLANE-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2732,10 +2745,10 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %xmm2
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 32(%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm3
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} = xmm2[1],xmm3[1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2756,10 +2769,10 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %xmm1
-; CHECK-FAST-PERLANE-NEXT:    vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vpbroadcastq 32(%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vpinsrq $1, 8(%rdi), %xmm0, %xmm2
 ; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm1[1],xmm2[1]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2771,9 +2784,10 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 48(%rdi), %xmm2
+; CHECK-NEXT:    vmovq 16(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovq 48(%rdi), %xmm3 # xmm3 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpunpcklqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
@@ -2785,9 +2799,10 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 48(%rdi), %xmm1
+; CHECK-NEXT:    vmovq 16(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovq 48(%rdi), %xmm2 # xmm2 = mem[0],zero
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpunpcklqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
@@ -2946,9 +2961,9 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [2,6,0,1]
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    vbroadcastss 8(%rdi), %xmm1
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [6,2,4,5]
+; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2957,9 +2972,9 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,6,0,1]
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    vbroadcastss 8(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [6,2,4,5]
+; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm3
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
@@ -2974,11 +2989,11 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,6,0,1]
+; CHECK-NEXT:    vbroadcastss 8(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,2,4,5]
 ; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
@@ -2991,12 +3006,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,7,7,2]
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vbroadcastss 12(%rdi), %xmm2
+; CHECK-NEXT:    vbroadcastss 24(%rdi), %xmm3
+; CHECK-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -3008,12 +3023,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,7,7,2]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vbroadcastss 12(%rdi), %xmm1
+; CHECK-NEXT:    vbroadcastss 24(%rdi), %xmm2
+; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -3025,13 +3040,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,1,3,7]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 28(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [7,5,7,3]
+; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
@@ -3043,12 +3057,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,1,3,7]
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 28(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [7,5,7,3]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
@@ -3060,10 +3074,9 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [1,3,5,3]
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 20(%rdi), %xmm1
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [5,7,1,7]
+; CHECK-NEXT:    vpermi2ps (%rdi), %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -3072,13 +3085,12 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [1,3,5,3]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 20(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [5,7,1,7]
+; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -3090,12 +3102,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,3,5,3]
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 20(%rdi), %xmm2
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [5,7,1,7]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
@@ -3680,14 +3692,10 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovddup {{.*#+}} xmm2 = [4,14,4,14]
-; CHECK-NEXT:    # xmm2 = mem[0,0]
-; CHECK-NEXT:    vmovaps 32(%rdi), %ymm3
-; CHECK-NEXT:    vpermt2ps (%rdi), %ymm2, %ymm3
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 48(%rdi), %xmm2
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vunpcklps 24(%rdi){1to4}, %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
@@ -3699,14 +3707,10 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovddup {{.*#+}} xmm2 = [4,14,4,14]
-; CHECK-NEXT:    # xmm2 = mem[0,0]
-; CHECK-NEXT:    vmovaps 32(%rdi), %ymm1
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vbroadcastss 48(%rdi), %xmm1
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vunpcklps 24(%rdi){1to4}, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
@@ -3718,9 +3722,10 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [3,3,15,9]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9]
+; CHECK-NEXT:    vbroadcastss 12(%rdi), %xmm0
+; CHECK-NEXT:    vpermt2ps 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -3730,12 +3735,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vbroadcastss 12(%rdi), %xmm3
+; CHECK-NEXT:    vpermt2ps 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -3748,11 +3753,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9]
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9]
+; CHECK-NEXT:    vbroadcastss 12(%rdi), %xmm1
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -3835,8 +3841,8 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double>
 define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %xmm0
-; CHECK-NEXT:    vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT:    vmovsd 16(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovhps 8(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
@@ -3845,11 +3851,11 @@ define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm2
-; CHECK-NEXT:    vblendpd $1, 16(%rdi), %xmm2, %xmm2 # xmm2 = mem[0],xmm2[1]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vmovapd %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovsd 16(%rdi), %xmm3 # xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
@@ -3861,11 +3867,11 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2
 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm1
-; CHECK-NEXT:    vblendpd $1, 16(%rdi), %xmm1, %xmm1 # xmm1 = mem[0],xmm1[1]
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vmovapd %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
@@ -3877,10 +3883,11 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp,
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 16(%rdi), %xmm2
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT:    vmovsd (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovsd 16(%rdi), %xmm3 # xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
@@ -3892,10 +3899,11 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2
 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 16(%rdi), %xmm1
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT:    vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, ptr %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
@@ -4382,8 +4390,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vbroadcastsd 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,0,6,2]
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,0,6,0]
 ; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
@@ -4392,10 +4400,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3]
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vblendpd $12, (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1],mem[2,3]
 ; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm2[3,0,2,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
@@ -4407,8 +4416,8 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vbroadcastsd 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,0,6,2]
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,0,6,0]
 ; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
@@ -4417,10 +4426,11 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp,
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vpermpd $236, (%rdi), %ymm1 # ymm1 = mem[0,3,2,3]
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vblendpd $12, (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1],mem[2,3]
 ; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-FAST-PERLANE-NEXT:    vshufpd $1, 32(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,2,0]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
@@ -4432,21 +4442,22 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [1,2,3,4]
-; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [5,6,7,0]
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT:    vperm2f128 $33, 32(%rdi), %ymm2, %ymm3 # ymm3 = ymm2[2,3],mem[0,1]
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[0,1]
 ; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
+; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm3[1],ymm2[0],ymm3[3],ymm2[2]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -4458,20 +4469,22 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [1,2,3,4]
-; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [5,6,7,0]
+; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST-PERLANE:       # %bb.0:
-; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT:    vperm2f128 $33, 32(%rdi), %ymm1, %ymm2 # ymm2 = ymm1[2,3],mem[0,1]
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vmovapd (%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
+; CHECK-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm1[0],ymm2[3],ymm1[2]
 ; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -4481,26 +4494,43 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp,
 }
 
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
-; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,6,5,4]
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm1, %ymm0
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $24, (%rdi), %ymm1 # ymm1 = mem[0,2,1,0]
+; CHECK-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [4,2,1,0]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,6,5,4]
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $24, (%rdi), %ymm3 # ymm3 = mem[0,2,1,0]
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm2, %ymm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4509,14 +4539,25 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4
 }
 
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0]
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,6,5,4]
+; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $24, (%rdi), %ymm2 # ymm2 = mem[0,2,1,0]
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm1, %ymm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4559,14 +4600,25 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp,
 }
 
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-NEXT:    vperm2f128 $33, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[2,3],mem[0,1]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT:    vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovddup 40(%rdi), %xmm2 # xmm2 = mem[0,0]
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,1,1,1]
+; CHECK-FAST-NEXT:    vpermi2pd 16(%rdi){1to4}, %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovddup 40(%rdi), %xmm2 # xmm2 = mem[0,0]
+; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd 16(%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,1]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4575,14 +4627,25 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4
 }
 
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm1
-; CHECK-NEXT:    vperm2f128 $33, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[0,1]
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT:    vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovddup 40(%rdi), %xmm2 # xmm2 = mem[0,0]
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [6,1,1,1]
+; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT:    vpermi2pd 16(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovddup 40(%rdi), %xmm1 # xmm1 = mem[0,0]
+; CHECK-FAST-PERLANE-NEXT:    vbroadcastsd 16(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,1]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4591,26 +4654,47 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp,
 }
 
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) {
-; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 32(%rdi), %ymm1
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,2,4,1]
-; CHECK-NEXT:    vpermi2pd (%rdi), %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [4,6,2,5]
+; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm0
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $104, 32(%rdi), %ymm1 # ymm1 = mem[0,2,2,1]
+; CHECK-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,2,4,1]
-; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,6,2,5]
+; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $104, 32(%rdi), %ymm3 # ymm3 = mem[0,2,2,1]
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm2, %ymm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4619,15 +4703,27 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4
 }
 
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,1]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,6,2,5]
+; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; CHECK-FAST-PERLANE-NEXT:    vpermpd $104, 32(%rdi), %ymm2 # ymm2 = mem[0,2,2,1]
+; CHECK-FAST-PERLANE-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3]
+; CHECK-FAST-PERLANE-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovapd %ymm1, %ymm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4668,8 +4764,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp,
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm0
-; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovhps 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
@@ -4678,10 +4774,11 @@ define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm2
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vmovsd 48(%rdi), %xmm3 # xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} = xmm2[0],xmm3[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
@@ -4693,10 +4790,11 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %xmm1
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
+; CHECK-NEXT:    vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT:    vmovsd 48(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
@@ -4711,7 +4809,7 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2
 ; CHECK-NEXT:    vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT:    vunpcklpd 32(%rdi){1to2}, %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
@@ -4726,7 +4824,7 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp,
 ; CHECK-NEXT:    vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT:    vunpcklpd 32(%rdi){1to2}, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
index b7b1212e767222..0df466cdab5bbc 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
@@ -826,7 +826,8 @@ define <2 x double> @test_2xdouble_zero_masked_shuff_mask1(<2 x double> %vec1, <
 define <2 x double> @test_2xdouble_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2p) {
 ; CHECK-LABEL: test_2xdouble_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
@@ -835,9 +836,10 @@ define <2 x double> @test_2xdouble_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2
 define <2 x double> @test_2xdouble_masked_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],xmm3[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -850,9 +852,10 @@ define <2 x double> @test_2xdouble_masked_shuff_mem_mask0(<2 x double> %vec1, pt
 define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
@@ -864,9 +867,10 @@ define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask0(<2 x double> %vec
 define <2 x double> @test_2xdouble_masked_shuff_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],xmm3[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -879,9 +883,10 @@ define <2 x double> @test_2xdouble_masked_shuff_mem_mask1(<2 x double> %vec1, pt
 define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll b/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll
index d0b183dfeae6eb..73e2c7e564c759 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll
@@ -826,7 +826,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %ve
 define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, ptr %vec2p) {
 ; CHECK-LABEL: test_2xdouble_unpack_low_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -835,9 +835,10 @@ define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, ptr
 define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -850,9 +851,10 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec
 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -864,9 +866,10 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double>
 define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -879,9 +882,10 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec
 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -2223,7 +2227,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %v
 define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, ptr %vec2p) {
 ; CHECK-LABEL: test_2xdouble_unpack_high_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -2234,7 +2238,7 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %ve
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -2249,7 +2253,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -2263,7 +2267,7 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %ve
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
@@ -2278,7 +2282,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, ptr %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
index f4eb5b952ae436..f93a1fff51f390 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -1938,7 +1938,8 @@ define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2,
 ; X64-NEXT:    vmovw (%rsi), %xmm0
 ; X64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; X64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X64-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
 ; X64-NEXT:    retq
 ;
@@ -1948,7 +1949,8 @@ define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2,
 ; X86-NEXT:    vmovw (%eax), %xmm0
 ; X86-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; X86-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X86-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
 ; X86-NEXT:    retl
   %6 = load i8, ptr %4, align 1
@@ -2111,7 +2113,7 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width
 ; X86-NEXT:    andl $-32, %esp
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X86-NEXT:    vpaddd 8(%ebp), %ymm1, %ymm1
+; X86-NEXT:    vpaddd 36(%ebp){1to8}, %ymm1, %ymm1
 ; X86-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
 ; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; X86-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index d92e1a1e7b9d49..be02a6071ab582 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -467,8 +467,9 @@ define i8 @test_bitreverse_i8(i8 %a) {
 ;
 ; X86XOP-LABEL: test_bitreverse_i8:
 ; X86XOP:       # %bb.0:
-; X86XOP-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
+; X86XOP-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86XOP-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm0
 ; X86XOP-NEXT:    vmovd %xmm0, %eax
 ; X86XOP-NEXT:    # kill: def $al killed $al killed $eax
 ; X86XOP-NEXT:    retl
@@ -533,8 +534,9 @@ define i4 @test_bitreverse_i4(i4 %a) {
 ;
 ; X86XOP-LABEL: test_bitreverse_i4:
 ; X86XOP:       # %bb.0:
-; X86XOP-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
+; X86XOP-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86XOP-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm0
 ; X86XOP-NEXT:    vmovd %xmm0, %eax
 ; X86XOP-NEXT:    shrb $4, %al
 ; X86XOP-NEXT:    # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index 4b0e5441b4abf1..dd62183904c884 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -77,19 +77,15 @@ entry:
 ; FIXME: This could be 'movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]'.
 
 define <2 x double> @test_negative_zero_2(<2 x double> %A) {
-; SSE2-LABEL: test_negative_zero_2:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_negative_zero_2:
-; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_negative_zero_2:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_negative_zero_2:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX-NEXT:    retq
 entry:
   %0 = extractelement <2 x double> %A, i32 0
diff --git a/llvm/test/CodeGen/X86/combine-fabs.ll b/llvm/test/CodeGen/X86/combine-fabs.ll
index 7aa6628cb7f391..1f2c0ee0c83d9d 100644
--- a/llvm/test/CodeGen/X86/combine-fabs.ll
+++ b/llvm/test/CodeGen/X86/combine-fabs.ll
@@ -40,7 +40,8 @@ define <4 x float> @combine_vec_fabs_constant() {
 define float @combine_fabs_fabs(float %a) {
 ; SSE-LABEL: combine_fabs_fabs:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE-NEXT:    andps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_fabs_fabs:
@@ -73,7 +74,8 @@ define <4 x float> @combine_vec_fabs_fabs(<4 x float> %a) {
 define float @combine_fabs_fneg(float %a) {
 ; SSE-LABEL: combine_fabs_fneg:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE-NEXT:    andps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_fabs_fneg:
@@ -106,7 +108,8 @@ define <4 x float> @combine_vec_fabs_fneg(<4 x float> %a) {
 define float @combine_fabs_fcopysign(float %a, float %b) {
 ; SSE-LABEL: combine_fabs_fcopysign:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE-NEXT:    andps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_fabs_fcopysign:
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 42f09d04da26ed..8a502eebf0d4ea 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -1609,10 +1609,12 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
 ;
 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; XOP-NEXT:    vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm1
+; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm1
 ; XOP-NEXT:    vpsrlq $62, %xmm1, %xmm1
 ; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
-; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm2
+; XOP-NEXT:    vpshaq %xmm2, %xmm1, %xmm1
 ; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; XOP-NEXT:    retq
   %1 = sdiv <2 x i64> %x, <i64 1, i64 4>
@@ -1739,7 +1741,8 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
 ; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm2
 ; XOP-NEXT:    vpsrlq $62, %xmm2, %xmm2
 ; XOP-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
-; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOP-NEXT:    vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm3
+; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm2
 ; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOP-NEXT:    vpshaq %xmm1, %xmm3, %xmm1
 ; XOP-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -3051,7 +3054,8 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
 ; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
 ; XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpinsrb $15, {{\.?LCPI[0-9]+_[0-9]+}}+15(%rip), %xmm0, %xmm2
+; XOP-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
 ; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; XOP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index 55715197830b14..58675f51565534 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -631,8 +631,12 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    psrlw $15, %xmm0
-; SSE2-NEXT:    pandn %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    psrlw $7, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm3 = [1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/commute-blend-avx2.ll b/llvm/test/CodeGen/X86/commute-blend-avx2.ll
index 75511104580e90..2ae06f1ab43c0b 100644
--- a/llvm/test/CodeGen/X86/commute-blend-avx2.ll
+++ b/llvm/test/CodeGen/X86/commute-blend-avx2.ll
@@ -70,7 +70,8 @@ declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nou
 define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, ptr %b) #0 {
 ; CHECK-LABEL: commute_fold_vblendpd_128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; CHECK-NEXT:    retq
   %1 = load <2 x double>, ptr %b
   %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
@@ -81,7 +82,8 @@ declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nou
 define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, ptr %b) #0 {
 ; CHECK-LABEL: commute_fold_vblendpd_256:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; CHECK-NEXT:    vbroadcastsd 24(%rdi), %ymm1
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; CHECK-NEXT:    retq
   %1 = load <4 x double>, ptr %b
   %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7)
diff --git a/llvm/test/CodeGen/X86/commute-blend-sse41.ll b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
index 07d6a8ba22bb13..aa6a5367558429 100644
--- a/llvm/test/CodeGen/X86/commute-blend-sse41.ll
+++ b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
@@ -26,7 +26,7 @@ declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwi
 define <2 x double> @commute_fold_blendpd(<2 x double> %a, ptr %b) {
 ; CHECK-LABEL: commute_fold_blendpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; CHECK-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT:    retq
   %1 = load <2 x double>, ptr %b
   %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
@@ -54,11 +54,11 @@ define <4 x i32> @commute_fold_blend_v4i32(ptr %a, <4 x i32> %b) {
 define void @baz(ptr %arg, ptr %arg1) optsize {
 ; CHECK-LABEL: baz:
 ; CHECK:       # %bb.0: # %bb
-; CHECK-NEXT:    movaps (%rdi), %xmm0
-; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [3,3]
-; CHECK-NEXT:    andps %xmm0, %xmm1
-; CHECK-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
-; CHECK-NEXT:    movups %xmm1, (%rsi)
+; CHECK-NEXT:    movdqa (%rdi), %xmm0
+; CHECK-NEXT:    pinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm1
+; CHECK-NEXT:    pand %xmm0, %xmm1
+; CHECK-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    movdqu %xmm1, (%rsi)
 ; CHECK-NEXT:    retq
 bb:
   %tmp = load <2 x i64>, ptr %arg, align 16
diff --git a/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll b/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll
index 0052359eedb50a..0d25e85e2042f1 100644
--- a/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll
+++ b/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll
@@ -13,7 +13,8 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define double @mag_pos0_double(double %x) nounwind {
 ; CHECK-LABEL: mag_pos0_double:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call double @copysign(double 0.0, double %x)
   ret double %y
@@ -25,7 +26,8 @@ define double @mag_pos0_double(double %x) nounwind {
 define double @mag_neg0_double(double %x) nounwind {
 ; CHECK-LABEL: mag_neg0_double:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call double @copysign(double -0.0, double %x)
   ret double %y
@@ -41,7 +43,8 @@ define double @mag_pos1_double(double %x) nounwind {
 ; CHECK-LABEL: mag_pos1_double:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call double @copysign(double 1.0, double %x)
   ret double %y
@@ -58,7 +61,8 @@ define double @mag_neg1_double(double %x) nounwind {
 ; CHECK-LABEL: mag_neg1_double:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call double @copysign(double -1.0, double %x)
   ret double %y
@@ -73,7 +77,8 @@ define double @mag_neg1_double(double %x) nounwind {
 define float @mag_pos0_float(float %x) nounwind {
 ; CHECK-LABEL: mag_pos0_float:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call float @copysignf(float 0.0, float %x)
   ret float %y
@@ -85,7 +90,8 @@ define float @mag_pos0_float(float %x) nounwind {
 define float @mag_neg0_float(float %x) nounwind {
 ; CHECK-LABEL: mag_neg0_float:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call float @copysignf(float -0.0, float %x)
   ret float %y
@@ -103,7 +109,8 @@ define float @mag_pos1_float(float %x) nounwind {
 ; CHECK-LABEL: mag_pos1_float:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call float @copysignf(float 1.0, float %x)
   ret float %y
@@ -124,7 +131,8 @@ define float @mag_neg1_float(float %x) nounwind {
 ; CHECK-LABEL: mag_neg1_float:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %y = call float @copysignf(float -1.0, float %x)
   ret float %y
diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll
index f12693469a3f6e..dba07d80c6cd66 100644
--- a/llvm/test/CodeGen/X86/extract-concat.ll
+++ b/llvm/test/CodeGen/X86/extract-concat.ll
@@ -153,14 +153,16 @@ define <16 x i64> @load_catcat(ptr %p) {
 define <4 x i32> @cat_ext_straddle(ptr %px, ptr %py) {
 ; SSE-LABEL: cat_ext_straddle:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps 16(%rdi), %xmm0
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: cat_ext_straddle:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX-NEXT:    retq
   %x = load <6 x i32>, ptr %px
   %y = load <6 x i32>, ptr %py
diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll
index 944f6bbfd0bfbe..dfc81b3c3fb915 100644
--- a/llvm/test/CodeGen/X86/extractelement-fp.ll
+++ b/llvm/test/CodeGen/X86/extractelement-fp.ll
@@ -353,9 +353,9 @@ define float @select_fcmp_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z,
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    vmovaps 8(%ebp), %xmm3
 ; X86-NEXT:    vcmpneq_oqss %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vblendvps %xmm0, %xmm2, %xmm3, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -546,7 +546,8 @@ define float @fabs_v4f32(<4 x float> %x) nounwind {
 define double @fabs_v4f64(<4 x double> %x) nounwind {
 ; X64-LABEL: fabs_v4f64:
 ; X64:       # %bb.0:
-; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vmovsd {{.*#+}} xmm1 = [NaN,0.0E+0]
+; X64-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 022b25a2415333..e7758acf126d63 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -13,10 +13,16 @@ define i32 @t(ptr %val) nounwind  {
 ; X86-SSE2-NEXT:    movl 8(%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X64-LABEL: t:
-; X64:       # %bb.0:
-; X64-NEXT:    movl 8(%rdi), %eax
-; X64-NEXT:    retq
+; X64-SSSE3-LABEL: t:
+; X64-SSSE3:       # %bb.0:
+; X64-SSSE3-NEXT:    movl 8(%rdi), %eax
+; X64-SSSE3-NEXT:    retq
+;
+; X64-AVX-LABEL: t:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX-NEXT:    vextractps $2, %xmm0, %eax
+; X64-AVX-NEXT:    retq
   %tmp2 = load <2 x i64>, ptr %val, align 16		; <<2 x i64>> [#uses=1]
   %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32>		; <<4 x i32>> [#uses=1]
   %tmp4 = extractelement <4 x i32> %tmp3, i32 2		; <i32> [#uses=1]
@@ -76,9 +82,11 @@ bb:
 define i64 @t4(ptr %a) {
 ; X86-SSE2-LABEL: t4:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl (%ecx), %eax
-; X86-SSE2-NEXT:    movl 4(%ecx), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
 ; X86-SSE2-NEXT:    retl
 ;
 ; X64-LABEL: t4:
@@ -126,8 +134,7 @@ define float @t6(ptr%a0) {
 ; X86-SSE2-NEXT:    pushl %eax
 ; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movaps (%eax), %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE2-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE2-NEXT:    cmpeqss %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -142,7 +149,7 @@ define float @t6(ptr%a0) {
 ;
 ; X64-SSSE3-LABEL: t6:
 ; X64-SSSE3:       # %bb.0:
-; X64-SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
+; X64-SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT:    xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT:    cmpeqss %xmm1, %xmm0
 ; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -226,8 +233,7 @@ define float @PR43971_1(ptr%a0) nounwind {
 ; X86-SSE2:       # %bb.0: # %entry
 ; X86-SSE2-NEXT:    pushl %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movaps (%eax), %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE2-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE2-NEXT:    cmpeqss %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -241,7 +247,7 @@ define float @PR43971_1(ptr%a0) nounwind {
 ;
 ; X64-SSSE3-LABEL: PR43971_1:
 ; X64-SSSE3:       # %bb.0: # %entry
-; X64-SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
+; X64-SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-SSSE3-NEXT:    xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT:    cmpeqss %xmm1, %xmm0
 ; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -317,12 +323,27 @@ define void @subextract_broadcast_load_constant(ptr nocapture %0, ptr nocapture
 ; X86-SSE2-NEXT:    movw $-24160, (%eax) # imm = 0xA1A0
 ; X86-SSE2-NEXT:    retl
 ;
-; X64-LABEL: subextract_broadcast_load_constant:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $-1583308898, (%rdi) # imm = 0xA1A09F9E
-; X64-NEXT:    movw $-24674, (%rsi) # imm = 0x9F9E
-; X64-NEXT:    movw $-24160, (%rdx) # imm = 0xA1A0
-; X64-NEXT:    retq
+; X64-SSSE3-LABEL: subextract_broadcast_load_constant:
+; X64-SSSE3:       # %bb.0:
+; X64-SSSE3-NEXT:    movl $-1583308898, (%rdi) # imm = 0xA1A09F9E
+; X64-SSSE3-NEXT:    movw $-24674, (%rsi) # imm = 0x9F9E
+; X64-SSSE3-NEXT:    movw $-24160, (%rdx) # imm = 0xA1A0
+; X64-SSSE3-NEXT:    retq
+;
+; X64-AVX1-LABEL: subextract_broadcast_load_constant:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    movl $-1583308898, (%rdi) # imm = 0xA1A09F9E
+; X64-AVX1-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; X64-AVX1-NEXT:    movw %ax, (%rsi)
+; X64-AVX1-NEXT:    movw $-24160, (%rdx) # imm = 0xA1A0
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: subextract_broadcast_load_constant:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    movl $-1583308898, (%rdi) # imm = 0xA1A09F9E
+; X64-AVX2-NEXT:    movw $-24674, (%rsi) # imm = 0x9F9E
+; X64-AVX2-NEXT:    movw $-24160, (%rdx) # imm = 0xA1A0
+; X64-AVX2-NEXT:    retq
   store i8 -98, ptr %0, align 1
   %4 = getelementptr inbounds i8, ptr %0, i64 1
   store i8 -97, ptr %4, align 1
diff --git a/llvm/test/CodeGen/X86/fabs.ll b/llvm/test/CodeGen/X86/fabs.ll
index 82c82ac3e917e3..d553cb7516fab4 100644
--- a/llvm/test/CodeGen/X86/fabs.ll
+++ b/llvm/test/CodeGen/X86/fabs.ll
@@ -21,7 +21,8 @@ define float @test1(float %X) {
 ;
 ; X64-LABEL: test1:
 ; X64:       # %bb.0:
-; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    andps %xmm1, %xmm0
 ; X64-NEXT:    retq
   %Y = call float @fabsf(float %X) readnone
   ret float %Y
diff --git a/llvm/test/CodeGen/X86/fast-isel-fneg.ll b/llvm/test/CodeGen/X86/fast-isel-fneg.ll
index 128f5ee0c318bc..240da2c8478492 100644
--- a/llvm/test/CodeGen/X86/fast-isel-fneg.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-fneg.ll
@@ -40,8 +40,9 @@ define float @fneg_f32(float %x) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushl %eax
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; SSE2-NEXT:    movss %xmm0, (%esp)
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT:    xorps %xmm0, %xmm1
+; SSE2-NEXT:    movss %xmm1, (%esp)
 ; SSE2-NEXT:    flds (%esp)
 ; SSE2-NEXT:    popl %eax
 ; SSE2-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/fma-signed-zero.ll b/llvm/test/CodeGen/X86/fma-signed-zero.ll
index f9e4e9929c6c4a..080469bd7d6da8 100644
--- a/llvm/test/CodeGen/X86/fma-signed-zero.ll
+++ b/llvm/test/CodeGen/X86/fma-signed-zero.ll
@@ -10,7 +10,8 @@ define float @fneg_fma32(float %x, float %y, float %z) {
 ; CHECK-LABEL: fneg_fma32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
-; CHECK-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %negx = fneg float %x
   %negz = fneg float %z
@@ -37,7 +38,8 @@ define double @fneg_fma64(double %x, double %y, double %z) {
 ; CHECK-LABEL: fneg_fma64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
-; CHECK-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; CHECK-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %negx = fneg double %x
   %negz = fneg double %z
diff --git a/llvm/test/CodeGen/X86/fp-fold.ll b/llvm/test/CodeGen/X86/fp-fold.ll
index 74b5232a4df62d..93716a48542ea9 100644
--- a/llvm/test/CodeGen/X86/fp-fold.ll
+++ b/llvm/test/CodeGen/X86/fp-fold.ll
@@ -31,7 +31,7 @@ define float @fadd_produce_zero(float %x) {
 define float @fadd_reassociate(float %x) {
 ; CHECK-LABEL: fadd_reassociate:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %sum = fadd float %x, 8.0
   %r = fadd reassoc nsz float %sum, 12.0
@@ -85,7 +85,7 @@ define float @fsub_neg_x_y(float %x, float %y) {
 define float @fsub_neg_y(float %x, float %y) {
 ; CHECK-LABEL: fsub_neg_y:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul float %x, 5.0
   %add = fadd float %mul, %y
@@ -96,7 +96,7 @@ define float @fsub_neg_y(float %x, float %y) {
 define <4 x float> @fsub_neg_y_vector(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: fsub_neg_y_vector:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul <4 x float> %x, <float 5.0, float 5.0, float 5.0, float 5.0>
   %add = fadd <4 x float> %mul, %y
@@ -107,7 +107,7 @@ define <4 x float> @fsub_neg_y_vector(<4 x float> %x, <4 x float> %y) {
 define <4 x float> @fsub_neg_y_vector_nonuniform(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: fsub_neg_y_vector_nonuniform:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul <4 x float> %x, <float 5.0, float 6.0, float 7.0, float 8.0>
   %add = fadd <4 x float> %mul, %y
@@ -118,7 +118,7 @@ define <4 x float> @fsub_neg_y_vector_nonuniform(<4 x float> %x, <4 x float> %y)
 define float @fsub_neg_y_commute(float %x, float %y) {
 ; CHECK-LABEL: fsub_neg_y_commute:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul float %x, 5.0
   %add = fadd float %y, %mul
@@ -129,7 +129,7 @@ define float @fsub_neg_y_commute(float %x, float %y) {
 define <4 x float> @fsub_neg_y_commute_vector(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: fsub_neg_y_commute_vector:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul <4 x float> %x, <float 5.0, float 5.0, float 5.0, float 5.0>
   %add = fadd <4 x float> %y, %mul
@@ -142,7 +142,8 @@ define <4 x float> @fsub_neg_y_commute_vector(<4 x float> %x, <4 x float> %y) {
 define float @fsub_fadd_common_op_fneg(float %x, float %y) {
 ; CHECK-LABEL: fsub_fadd_common_op_fneg:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %a = fadd float %x, %y
   %r = fsub reassoc nsz float %y, %a
@@ -154,7 +155,7 @@ define float @fsub_fadd_common_op_fneg(float %x, float %y) {
 define <4 x float> @fsub_fadd_common_op_fneg_vec(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: fsub_fadd_common_op_fneg_vec:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %a = fadd <4 x float> %x, %y
   %r = fsub nsz reassoc <4 x float> %y, %a
@@ -167,7 +168,8 @@ define <4 x float> @fsub_fadd_common_op_fneg_vec(<4 x float> %x, <4 x float> %y)
 define float @fsub_fadd_common_op_fneg_commute(float %x, float %y) {
 ; CHECK-LABEL: fsub_fadd_common_op_fneg_commute:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %a = fadd float %y, %x
   %r = fsub reassoc nsz float %y, %a
@@ -179,7 +181,7 @@ define float @fsub_fadd_common_op_fneg_commute(float %x, float %y) {
 define <4 x float> @fsub_fadd_common_op_fneg_commute_vec(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: fsub_fadd_common_op_fneg_commute_vec:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %a = fadd <4 x float> %y, %x
   %r = fsub reassoc nsz <4 x float> %y, %a
@@ -233,7 +235,8 @@ define float @fsub_zero_nsz_1(float %x) {
 define float @fsub_zero_nsz_2(float %x) {
 ; CHECK-LABEL: fsub_zero_nsz_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %r = fsub nsz float 0.0, %x
   ret float %r
@@ -259,7 +262,7 @@ define float @fmul_one(float %x) {
 define float @fmul_x_const_const(float %x) {
 ; CHECK-LABEL: fmul_x_const_const:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %mul = fmul reassoc float %x, 9.0
   %r = fmul reassoc float %mul, 4.0
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
index 71d49481ebb8e7..5bd48d80354fc6 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
@@ -9,7 +9,8 @@ define float @f1(float %0, float %1, float %2) #0 {
 ; NOFMA:       # %bb.0: # %entry
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; NOFMA-NEXT:    movss {{.*#+}} xmm3 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm3, %xmm0
 ; NOFMA-NEXT:    callq fmaf@PLT
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
@@ -37,7 +38,8 @@ define double @f2(double %0, double %1, double %2) #0 {
 ; NOFMA:       # %bb.0: # %entry
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; NOFMA-NEXT:    movsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm3, %xmm0
 ; NOFMA-NEXT:    callq fma@PLT
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
@@ -65,7 +67,8 @@ define float @f3(float %0, float %1, float %2) #0 {
 ; NOFMA:       # %bb.0: # %entry
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; NOFMA-NEXT:    movss {{.*#+}} xmm3 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm3, %xmm2
 ; NOFMA-NEXT:    callq fmaf@PLT
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
@@ -93,7 +96,8 @@ define double @f4(double %0, double %1, double %2) #0 {
 ; NOFMA:       # %bb.0: # %entry
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; NOFMA-NEXT:    movsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm3, %xmm2
 ; NOFMA-NEXT:    callq fma@PLT
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
@@ -184,7 +188,8 @@ define float @f7(float %0, float %1, float %2) #0 {
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
 ; NOFMA-NEXT:    callq fmaf@PLT
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; NOFMA-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm1, %xmm0
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
 ; NOFMA-NEXT:    retq
@@ -192,13 +197,15 @@ define float @f7(float %0, float %1, float %2) #0 {
 ; FMA-AVX1-LABEL: f7:
 ; FMA-AVX1:       # %bb.0: # %entry
 ; FMA-AVX1-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
-; FMA-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA-AVX1-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; FMA-AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; FMA-AVX1-NEXT:    retq
 ;
 ; FMA4-LABEL: f7:
 ; FMA4:       # %bb.0: # %entry
 ; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
-; FMA4-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; FMA4-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
 ;
 ; FMA-AVX512-LABEL: f7:
@@ -221,7 +228,8 @@ define double @f8(double %0, double %1, double %2) #0 {
 ; NOFMA-NEXT:    pushq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 16
 ; NOFMA-NEXT:    callq fma@PLT
-; NOFMA-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; NOFMA-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; NOFMA-NEXT:    xorps %xmm1, %xmm0
 ; NOFMA-NEXT:    popq %rax
 ; NOFMA-NEXT:    .cfi_def_cfa_offset 8
 ; NOFMA-NEXT:    retq
@@ -229,13 +237,15 @@ define double @f8(double %0, double %1, double %2) #0 {
 ; FMA-LABEL: f8:
 ; FMA:       # %bb.0: # %entry
 ; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
-; FMA-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; FMA-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; FMA-NEXT:    retq
 ;
 ; FMA4-LABEL: f8:
 ; FMA4:       # %bb.0: # %entry
 ; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
-; FMA4-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; FMA4-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
 entry:
   %3 = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2,
@@ -262,13 +272,15 @@ define float @f9(float %0, float %1, float %2) #0 {
 ; FMA-AVX1-LABEL: f9:
 ; FMA-AVX1:       # %bb.0: # %entry
 ; FMA-AVX1-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
-; FMA-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA-AVX1-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; FMA-AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; FMA-AVX1-NEXT:    retq
 ;
 ; FMA4-LABEL: f9:
 ; FMA4:       # %bb.0: # %entry
 ; FMA4-NEXT:    vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
-; FMA4-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; FMA4-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
 ;
 ; FMA-AVX512-LABEL: f9:
@@ -304,13 +316,15 @@ define double @f10(double %0, double %1, double %2) #0 {
 ; FMA-LABEL: f10:
 ; FMA:       # %bb.0: # %entry
 ; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
-; FMA-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; FMA-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; FMA-NEXT:    retq
 ;
 ; FMA4-LABEL: f10:
 ; FMA4:       # %bb.0: # %entry
 ; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
-; FMA4-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; FMA4-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; FMA4-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
 entry:
   %3 = fneg double %0
diff --git a/llvm/test/CodeGen/X86/fp-logic.ll b/llvm/test/CodeGen/X86/fp-logic.ll
index 522a1589caf09f..a21204cb307717 100644
--- a/llvm/test/CodeGen/X86/fp-logic.ll
+++ b/llvm/test/CodeGen/X86/fp-logic.ll
@@ -243,7 +243,8 @@ define float @movmsk(float %x) {
 define double @bitcast_fabs(double %x) {
 ; CHECK-LABEL: bitcast_fabs:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [NaN,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %bc1 = bitcast double %x to i64
   %and = and i64 %bc1, 9223372036854775807
@@ -254,7 +255,8 @@ define double @bitcast_fabs(double %x) {
 define float @bitcast_fneg(float %x) {
 ; CHECK-LABEL: bitcast_fneg:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %bc1 = bitcast float %x to i32
   %xor = xor i32 %bc1, 2147483648
@@ -311,7 +313,8 @@ define float @fsub_bitcast_fneg(float %x, float %y) {
 define float @nabsf(float %a) {
 ; CHECK-LABEL: nabsf:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %conv = bitcast float %a to i32
   %and = or i32 %conv, -2147483648
@@ -322,7 +325,8 @@ define float @nabsf(float %a) {
 define double @nabsd(double %a) {
 ; CHECK-LABEL: nabsd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; CHECK-NEXT:    orps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %conv = bitcast double %a to i64
   %and = or i64 %conv, -9223372036854775808
diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll
index 8efd5819a6d22b..2ea83598219a31 100644
--- a/llvm/test/CodeGen/X86/fp-round.ll
+++ b/llvm/test/CodeGen/X86/fp-round.ll
@@ -24,10 +24,11 @@ define half @round_f16(half %h) {
 ; SSE41-NEXT:    callq __extendhfsf2@PLT
 ; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; SSE41-NEXT:    andps %xmm0, %xmm1
-; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    addss %xmm0, %xmm1
+; SSE41-NEXT:    movss {{.*#+}} xmm2 = [4.9999997E-1,0.0E+0,0.0E+0,0.0E+0]
+; SSE41-NEXT:    orps %xmm1, %xmm2
+; SSE41-NEXT:    addss %xmm0, %xmm2
 ; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    roundss $11, %xmm1, %xmm0
+; SSE41-NEXT:    roundss $11, %xmm2, %xmm0
 ; SSE41-NEXT:    callq __truncsfhf2@PLT
 ; SSE41-NEXT:    popq %rax
 ; SSE41-NEXT:    .cfi_def_cfa_offset 8
@@ -55,7 +56,7 @@ define half @round_f16(half %h) {
 ; AVX512F-NEXT:    vmovd %eax, %xmm0
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -67,7 +68,7 @@ define half @round_f16(half %h) {
 ; AVX512FP16:       ## %bb.0: ## %entry
 ; AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1]
-; AVX512FP16-NEXT:    vpternlogq $248, %xmm1, %xmm0, %xmm2
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm2 = xmm2 | (xmm0 & xmm1)
 ; AVX512FP16-NEXT:    vaddsh %xmm2, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vrndscalesh $11, %xmm0, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -85,10 +86,11 @@ define float @round_f32(float %x) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; SSE41-NEXT:    andps %xmm0, %xmm1
-; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    addss %xmm0, %xmm1
+; SSE41-NEXT:    movss {{.*#+}} xmm2 = [4.9999997E-1,0.0E+0,0.0E+0,0.0E+0]
+; SSE41-NEXT:    orps %xmm1, %xmm2
+; SSE41-NEXT:    addss %xmm0, %xmm2
 ; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    roundss $11, %xmm1, %xmm0
+; SSE41-NEXT:    roundss $11, %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: round_f32:
@@ -103,7 +105,7 @@ define float @round_f32(float %x) {
 ; AVX512F-LABEL: round_f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -111,7 +113,7 @@ define float @round_f32(float %x) {
 ; AVX512FP16-LABEL: round_f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512FP16-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -128,10 +130,11 @@ define double @round_f64(double %x) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
 ; SSE41-NEXT:    andpd %xmm0, %xmm1
-; SSE41-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    addsd %xmm0, %xmm1
+; SSE41-NEXT:    movsd {{.*#+}} xmm2 = [4.9999999999999994E-1,0.0E+0]
+; SSE41-NEXT:    orpd %xmm1, %xmm2
+; SSE41-NEXT:    addsd %xmm0, %xmm2
 ; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    roundsd $11, %xmm1, %xmm0
+; SSE41-NEXT:    roundsd $11, %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: round_f64:
@@ -147,7 +150,7 @@ define double @round_f64(double %x) {
 ; AVX512F-LABEL: round_f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
+; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512F-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -155,7 +158,7 @@ define double @round_f64(double %x) {
 ; AVX512FP16-LABEL: round_f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512FP16-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -213,7 +216,7 @@ define <4 x float> @round_v4f32(<4 x float> %x) {
 ; AVX512F-LABEL: round_v4f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512F-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundps $11, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -221,7 +224,7 @@ define <4 x float> @round_v4f32(<4 x float> %x) {
 ; AVX512FP16-LABEL: round_v4f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512FP16-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundps $11, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -267,7 +270,7 @@ define <2 x double> @round_v2f64(<2 x double> %x) {
 ; AVX512F-LABEL: round_v2f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
+; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512F-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundpd $11, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -275,7 +278,7 @@ define <2 x double> @round_v2f64(<2 x double> %x) {
 ; AVX512FP16-LABEL: round_v2f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
 ; AVX512FP16-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundpd $11, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -361,7 +364,7 @@ define <8 x float> @round_v8f32(<8 x float> %x) {
 ; AVX512F-LABEL: round_v8f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
 ; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vroundps $11, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -369,7 +372,7 @@ define <8 x float> @round_v8f32(<8 x float> %x) {
 ; AVX512FP16-LABEL: round_v8f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
 ; AVX512FP16-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    vroundps $11, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    retq
@@ -431,7 +434,7 @@ define <4 x double> @round_v4f64(<4 x double> %x) {
 ; AVX512F-LABEL: round_v4f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
 ; AVX512F-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vroundpd $11, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -439,7 +442,7 @@ define <4 x double> @round_v4f64(<4 x double> %x) {
 ; AVX512FP16-LABEL: round_v4f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
 ; AVX512FP16-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    vroundpd $11, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    retq
@@ -587,7 +590,7 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
 ; AVX512F-LABEL: round_v16f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
 ; AVX512F-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vrndscaleps $11, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -595,7 +598,7 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
 ; AVX512FP16-LABEL: round_v16f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
 ; AVX512FP16-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    vrndscaleps $11, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    retq
@@ -695,7 +698,7 @@ define <8 x double> @round_v8f64(<8 x double> %x) {
 ; AVX512F-LABEL: round_v8f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
 ; AVX512F-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vrndscalepd $11, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -703,7 +706,7 @@ define <8 x double> @round_v8f64(<8 x double> %x) {
 ; AVX512FP16-LABEL: round_v8f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
 ; AVX512FP16-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    vrndscalepd $11, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll
index 1de2484d47ba1b..d16fdcdf1752da 100644
--- a/llvm/test/CodeGen/X86/fp128-cast.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast.ll
@@ -1260,7 +1260,8 @@ define fp128 @TestTruncCopysign(fp128 %x, i32 %n) nounwind {
 ; X64-SSE-NEXT:    pushq %rax
 ; X64-SSE-NEXT:    callq __trunctfdf2@PLT
 ; X64-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm1 = [+Inf,0.0E+0]
+; X64-SSE-NEXT:    orps %xmm1, %xmm0
 ; X64-SSE-NEXT:    callq __extenddftf2@PLT
 ; X64-SSE-NEXT:    addq $8, %rsp
 ; X64-SSE-NEXT:  .LBB26_2: # %cleanup
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index 3af8b1aec1feb2..5a1f0da86de633 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -335,7 +335,8 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
 ; F16C-LABEL: test_half_fabs:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; F16C-NEXT:    vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; F16C-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
@@ -352,7 +353,8 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    movq %rdi, %rbx
 ; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movd {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    pand %xmm1, %xmm0
 ; X64-NEXT:    callq __truncsfhf2@PLT
 ; X64-NEXT:    pextrw $0, %xmm0, %eax
 ; X64-NEXT:    movw %ax, (%rbx)
@@ -515,7 +517,8 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
 ; F16C-LABEL: test_half_fneg:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; F16C-NEXT:    vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; F16C-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
@@ -532,7 +535,8 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    movq %rdi, %rbx
 ; X64-NEXT:    callq __extendhfsf2@PLT
-; X64-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    pxor %xmm1, %xmm0
 ; X64-NEXT:    callq __truncsfhf2@PLT
 ; X64-NEXT:    pextrw $0, %xmm0, %eax
 ; X64-NEXT:    movw %ax, (%rbx)
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 362b3b945f9622..57695316386b78 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -311,8 +311,8 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
-; X86-NEXT:    vpinsrd $0, %ecx, %xmm0, %xmm0
+; X86-NEXT:    vmovd %ecx, %xmm0
+; X86-NEXT:    vpinsrd $1, {{\.?LCPI[0-9]+_[0-9]+}}+4, %xmm0, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
@@ -320,8 +320,8 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin
 ;
 ; X64-LABEL: freeze_buildvector_single_repeated_maybe_poison_operand:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42]
-; X64-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    vpinsrd $1, {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index c071f64dc66cdf..2d8484fb82faef 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -492,7 +492,7 @@ define <16 x i8> @splatconstant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsllw $3, %xmm0, %xmm2
 ; GFNIAVX512-NEXT:    vpsrlw $5, %xmm1, %xmm0
-; GFNIAVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
   ret <16 x i8> %res
@@ -518,7 +518,7 @@ define <16 x i8> @splatconstant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpaddw %xmm0, %xmm0, %xmm2
 ; GFNIAVX512-NEXT:    vpsrlw $7, %xmm1, %xmm0
-; GFNIAVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
   ret <16 x i8> %res
@@ -1311,7 +1311,7 @@ define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; GFNIAVX512-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; GFNIAVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <32 x i8> %res
@@ -1349,7 +1349,7 @@ define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsllw $2, %ymm0, %ymm2
 ; GFNIAVX512-NEXT:    vpsrlw $6, %ymm1, %ymm0
-; GFNIAVX512-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>)
   ret <32 x i8> %res
@@ -2147,7 +2147,7 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou
 define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: splatvar_fshl_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; GFNISSE-NEXT:    movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm9
 ; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
 ; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
@@ -2247,24 +2247,25 @@ define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt
 ;
 ; GFNIAVX512VL-LABEL: splatvar_fshl_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; GFNIAVX512VL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; GFNIAVX512VL-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; GFNIAVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; GFNIAVX512VL-NEXT:    vpsllw %xmm3, %ymm5, %ymm5
 ; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; GFNIAVX512VL-NEXT:    vpsllw %xmm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
 ; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
 ; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: splatvar_fshl_v64i8:
@@ -2286,7 +2287,7 @@ define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt
 define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: splatvar_fshr_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT:    movd {{.*#+}} xmm9 = mem[0],zero,zero,zero
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm10
 ; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
 ; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9
@@ -2389,25 +2390,26 @@ define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt
 ;
 ; GFNIAVX512VL-LABEL: splatvar_fshr_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; GFNIAVX512VL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; GFNIAVX512VL-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; GFNIAVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm3, %ymm5, %ymm5
 ; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm3, %ymm4, %ymm4
 ; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
 ; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: splatvar_fshr_v64i8:
@@ -2775,7 +2777,7 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
 ; GFNIAVX512BW:       # %bb.0:
 ; GFNIAVX512BW-NEXT:    vpaddw %zmm0, %zmm0, %zmm2
 ; GFNIAVX512BW-NEXT:    vpsrlw $7, %zmm1, %zmm0
-; GFNIAVX512BW-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
 ; GFNIAVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <64 x i8> %res
@@ -2836,7 +2838,7 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
 ; GFNIAVX512BW:       # %bb.0:
 ; GFNIAVX512BW-NEXT:    vpsllw $6, %zmm0, %zmm2
 ; GFNIAVX512BW-NEXT:    vpsrlw $2, %zmm1, %zmm0
-; GFNIAVX512BW-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
+; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
 ; GFNIAVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>)
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 6e7f109a5da5c2..df5f8892c879f6 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1272,23 +1272,21 @@ define <8 x half> @select(i1 %c, <8 x half> %x, <8 x half> %y) {
 define <8 x half> @shuffle(ptr %p) {
 ; CHECK-LIBCALL-LABEL: shuffle:
 ; CHECK-LIBCALL:       # %bb.0:
-; CHECK-LIBCALL-NEXT:    movdqu (%rdi), %xmm0
-; CHECK-LIBCALL-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-LIBCALL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-LIBCALL-NEXT:    pinsrw $0, 8(%rdi), %xmm0
+; CHECK-LIBCALL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; CHECK-LIBCALL-NEXT:    retq
 ;
 ; BWON-F16C-LABEL: shuffle:
 ; BWON-F16C:       # %bb.0:
-; BWON-F16C-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,4,4,4,4]
-; BWON-F16C-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; BWON-F16C-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
+; BWON-F16C-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: shuffle:
 ; CHECK-I686:       # %bb.0:
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    movdqu (%eax), %xmm0
-; CHECK-I686-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-I686-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; CHECK-I686-NEXT:    pinsrw $0, 8(%eax), %xmm0
+; CHECK-I686-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; CHECK-I686-NEXT:    retl
   %1 = load <8 x half>, ptr %p, align 8
   %2 = shufflevector <8 x half> %1, <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index c44945ac2d929f..20aa93bf10ec28 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -145,9 +145,9 @@ define <2 x i64> @elt0_v2i64(i64 %x) {
 ;
 ; X64-SSE2-LABEL: elt0_v2i64:
 ; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movq %rdi, %xmm1
-; X64-SSE2-NEXT:    movapd {{.*#+}} xmm0 = [u,1]
-; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-SSE2-NEXT:    movq %rdi, %xmm0
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE4-LABEL: elt0_v2i64:
@@ -218,28 +218,26 @@ define <4 x float> @elt1_v4f32(float %x) {
 define <2 x double> @elt1_v2f64(double %x) {
 ; X86-SSE-LABEL: elt1_v2f64:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,u]
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: elt1_v2f64:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movaps {{.*#+}} xmm1 = [4.2E+1,u]
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; X64-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; X64-SSE-NEXT:    movaps %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; X86-AVX-LABEL: elt1_v2f64:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [4.2E+1,4.2E+1]
-; X86-AVX-NEXT:    # xmm0 = mem[0,0]
+; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-AVX-LABEL: elt1_v2f64:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
-; X64-AVX-NEXT:    # xmm1 = mem[0,0]
+; X64-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X64-AVX-NEXT:    retq
    %ins = insertelement <2 x double> <double 42.0, double 1.0>, double %x, i32 1
@@ -384,7 +382,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 ; X64-SSE2-LABEL: elt5_v8i64:
 ; X64-SSE2:       # %bb.0:
 ; X64-SSE2-NEXT:    movq %rdi, %xmm0
-; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4,u]
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = [4,0]
 ; X64-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
 ; X64-SSE2-NEXT:    movaps {{.*#+}} xmm0 = [42,1]
 ; X64-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [2,3]
@@ -457,7 +455,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 define <8 x double> @elt1_v8f64(double %x) {
 ; X86-SSE-LABEL: elt1_v8f64:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,u]
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-SSE-NEXT:    movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0]
 ; X86-SSE-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0]
@@ -466,7 +464,7 @@ define <8 x double> @elt1_v8f64(double %x) {
 ;
 ; X64-SSE-LABEL: elt1_v8f64:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movaps {{.*#+}} xmm4 = [4.2E+1,u]
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm4 = [4.2E+1,0.0E+0]
 ; X64-SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
 ; X64-SSE-NEXT:    movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0]
 ; X64-SSE-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0]
@@ -476,47 +474,49 @@ define <8 x double> @elt1_v8f64(double %x) {
 ;
 ; X86-AVX1-LABEL: elt1_v8f64:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [4.2E+1,u,2.0E+0,3.0E+0]
-; X86-AVX1-NEXT:    vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; X86-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X86-AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; X86-AVX1-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; X86-AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X86-AVX1-NEXT:    retl
 ;
 ; X64-AVX1-LABEL: elt1_v8f64:
 ; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [4.2E+1,u,2.0E+0,3.0E+0]
+; X64-AVX1-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; X64-AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X64-AVX1-NEXT:    retq
 ;
 ; X86-AVX2-LABEL: elt1_v8f64:
 ; X86-AVX2:       # %bb.0:
-; X86-AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [4.2E+1,u,2.0E+0,3.0E+0]
-; X86-AVX2-NEXT:    vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X86-AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; X86-AVX2-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; X86-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: elt1_v8f64:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.2E+1,u,2.0E+0,3.0E+0]
+; X64-AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; X64-AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; X64-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X64-AVX2-NEXT:    retq
 ;
 ; X86-AVX512F-LABEL: elt1_v8f64:
 ; X86-AVX512F:       # %bb.0:
-; X86-AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
-; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; X86-AVX512F-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
+; X86-AVX512F-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; X86-AVX512F-NEXT:    retl
 ;
 ; X64-AVX512F-LABEL: elt1_v8f64:
 ; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
+; X64-AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; X64-AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X64-AVX512F-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; X64-AVX512F-NEXT:    retq
    %ins = insertelement <8 x double> <double 42.0, double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0>, double %x, i32 1
diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll
index 18edc83b7edcf7..ee7af27b0ac706 100644
--- a/llvm/test/CodeGen/X86/insertps-combine.ll
+++ b/llvm/test/CodeGen/X86/insertps-combine.ll
@@ -269,12 +269,12 @@ define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
 define float @extract_lane_insertps_5123(<4 x float> %a0, ptr%p1) {
 ; SSE-LABEL: extract_lane_insertps_5123:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3]
+; SSE-NEXT:    movss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract_lane_insertps_5123:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3]
+; AVX-NEXT:    vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %a1 = load <4 x float>, ptr%p1
   %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 64)
@@ -285,13 +285,12 @@ define float @extract_lane_insertps_5123(<4 x float> %a0, ptr%p1) {
 define float @extract_lane_insertps_6123(<4 x float> %a0, ptr%p1) {
 ; SSE-LABEL: extract_lane_insertps_6123:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movss 8(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract_lane_insertps_6123:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilpd $1, (%rdi), %xmm0 # xmm0 = mem[1,0]
+; AVX-NEXT:    vmovss 8(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %a1 = load <4 x float>, ptr%p1
   %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)
@@ -308,7 +307,8 @@ define <4 x float> @commute_load_insertps(<4 x float>, ptr nocapture readonly) {
 ;
 ; AVX-LABEL: commute_load_insertps:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vinsertps $53, 12(%rdi), %xmm0, %xmm0 # xmm0 = zero,xmm0[1],zero,mem[0]
+; AVX-NEXT:    vbroadcastss 12(%rdi), %xmm1
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[1],zero,xmm1[3]
 ; AVX-NEXT:    retq
   %3 = load <4 x float>, ptr %1
   %4 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %3, <4 x float> %0, i8 85)
diff --git a/llvm/test/CodeGen/X86/insertps-from-constantpool.ll b/llvm/test/CodeGen/X86/insertps-from-constantpool.ll
index f03df634dc1de4..99ed327c36c3e2 100644
--- a/llvm/test/CodeGen/X86/insertps-from-constantpool.ll
+++ b/llvm/test/CodeGen/X86/insertps-from-constantpool.ll
@@ -7,12 +7,14 @@
 define <4 x float> @fold_from_constantpool(<4 x float> %a) {
 ; X86-LABEL: fold_from_constantpool:
 ; X86:       # %bb.0:
-; X86-NEXT:    insertps $0, {{\.?LCPI[0-9]+_[0-9]+}}+4, %xmm0 # xmm0 = mem[0],xmm0[1,2,3]
+; X86-NEXT:    movss {{\.?LCPI[0-9]+_[0-9]+}}+4, %xmm1 # xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fold_from_constantpool:
 ; X64:       # %bb.0:
-; X64-NEXT:    insertps $0, {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0 # xmm0 = mem[0],xmm0[1,2,3]
+; X64-NEXT:    movss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> <float 0.0, float 1.0, float 0.0, float 0.0>, i8 64)
   ret <4 x float> %1
diff --git a/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll b/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll
index 93b60c27255f33..29737b3acf55e5 100644
--- a/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll
+++ b/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll
@@ -11,7 +11,7 @@ define <4 x float> @insertps_unfold(ptr %v0, ptr %v1) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-NEXT:    movaps (%eax), %xmm0
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; X32-NEXT:    addps %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
@@ -19,7 +19,7 @@ define <4 x float> @insertps_unfold(ptr %v0, ptr %v1) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-NEXT:    movaps (%rdi), %xmm0
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; X64-NEXT:    addps %xmm1, %xmm0
 ; X64-NEXT:    retq
   %a = getelementptr inbounds <4 x float>, ptr %v1, i64 0, i64 1
diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index 97136dafa6c2c0..198d1a08a9342d 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -2595,8 +2595,9 @@ define i1 @issubnormal_or_zero_or_nan_f(float %x) {
 ;
 ; X64-LABEL: issubnormal_or_zero_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    andps %xmm0, %xmm1
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 243)  ; 0xf0|0x3 = "subnormal|zero|nan"
@@ -2764,8 +2765,9 @@ define i1 @not_issubnormal_or_zero_or_nan_f(float %x) {
 ;
 ; X64-LABEL: not_issubnormal_or_zero_or_nan_f:
 ; X64:       # %bb.0:
-; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    andps %xmm0, %xmm1
+; X64-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-NEXT:    setae %al
 ; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 780)  ; ~(0xf0|0x3) = ~"subnormal|zero|nan"
diff --git a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll
index fb7efc2200c671..ad8878a6f83b7f 100644
--- a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll
+++ b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll
@@ -5,8 +5,11 @@ define void @csrot_(ptr %0) {
 ; CHECK-LABEL: csrot_:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    movlps %xmm0, (%rax)
 ; CHECK-NEXT:    retq
 1:
diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll
index dba63582ff08b1..3a1cfcb9c9a6fa 100644
--- a/llvm/test/CodeGen/X86/load-partial.ll
+++ b/llvm/test/CodeGen/X86/load-partial.ll
@@ -211,13 +211,11 @@ define <4 x float> @load_float4_float3_trunc_0123(ptr nocapture readonly derefer
 ; SSE2-LABEL: load_float4_float3_trunc_0123:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps (%rdi), %xmm0
-; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_float4_float3_trunc_0123:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movaps (%rdi), %xmm0
-; SSSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_float4_float3_trunc_0123:
@@ -257,13 +255,11 @@ define <4 x float> @load_float4_float3_trunc_0123_unaligned(ptr nocapture readon
 ; SSE2-LABEL: load_float4_float3_trunc_0123_unaligned:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movups (%rdi), %xmm0
-; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_float4_float3_trunc_0123_unaligned:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movups (%rdi), %xmm0
-; SSSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_float4_float3_trunc_0123_unaligned:
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 89459a2d10177d..7cba05d61a1c3d 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6791,7 +6791,8 @@ define <8 x double> @mload_constmask_v8f64(ptr %addr, <8 x double> %dst) {
 ; AVX1OR2-LABEL: mload_constmask_v8f64:
 ; AVX1OR2:       ## %bb.0:
 ; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
-; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX1OR2-NEXT:    vbroadcastsd 56(%rdi), %ymm2
+; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512F-LABEL: mload_constmask_v8f64:
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index c7ec5e87dcc6bd..1a8200c322973f 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -4701,13 +4701,13 @@ define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
 ; SSE2-NEXT:    movq %xmm5, (%rdi)
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm5, 8(%rdi)
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = mem[2,3,2,3]
-; SSE2-NEXT:    movq %xmm5, 24(%rdi)
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq %rcx, 24(%rdi)
 ; SSE2-NEXT:    movq %rax, 32(%rdi)
 ; SSE2-NEXT:    movq %xmm4, 48(%rdi)
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
@@ -4733,11 +4733,10 @@ define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16
 ; SSE4-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
 ; SSE4-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
 ; SSE4-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
-; SSE4-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
-; SSE4-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm6
-; SSE4-NEXT:    movups %xmm6, (%rdi)
-; SSE4-NEXT:    palignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; SSE4-NEXT:    movdqu %xmm5, 24(%rdi)
+; SSE4-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
+; SSE4-NEXT:    movups %xmm5, (%rdi)
+; SSE4-NEXT:    movups {{[0-9]+}}(%rsp), %xmm5
+; SSE4-NEXT:    movups %xmm5, 24(%rdi)
 ; SSE4-NEXT:    movups %xmm4, 48(%rdi)
 ; SSE4-NEXT:    movups %xmm3, 64(%rdi)
 ; SSE4-NEXT:    movups %xmm2, 80(%rdi)
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index 73d459ba770264..751805db38ec06 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -325,14 +325,11 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    andq (%rsi), %rax
-; X64-NEXT:    movq %rax, %xmm0
 ; X64-NEXT:    movq %rax, (%rdi)
-; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    por %xmm0, %xmm1
-; X64-NEXT:    movq %xmm1, (%rdi)
-; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pxor %xmm1, %xmm0
-; X64-NEXT:    movq %xmm0, (%rdi)
+; X64-NEXT:    orq (%rsi), %rax
+; X64-NEXT:    movq %rax, (%rdi)
+; X64-NEXT:    xorq (%rsi), %rax
+; X64-NEXT:    movq %rax, (%rdi)
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/neg_fp.ll b/llvm/test/CodeGen/X86/neg_fp.ll
index 80209825098198..84bbf18dde36a3 100644
--- a/llvm/test/CodeGen/X86/neg_fp.ll
+++ b/llvm/test/CodeGen/X86/neg_fp.ll
@@ -10,8 +10,9 @@ define float @negfp(float %a, float %b) nounwind {
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    subss {{[0-9]+}}(%esp), %xmm0
-; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm0, %xmm1
+; CHECK-NEXT:    movss %xmm1, (%esp)
 ; CHECK-NEXT:    flds (%esp)
 ; CHECK-NEXT:    popl %eax
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/negative-sin.ll b/llvm/test/CodeGen/X86/negative-sin.ll
index f24507d3a4f38a..3368dde860875e 100644
--- a/llvm/test/CodeGen/X86/negative-sin.ll
+++ b/llvm/test/CodeGen/X86/negative-sin.ll
@@ -56,7 +56,8 @@ define double @semi_strict1(double %e) nounwind {
 ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    callq sin at PLT
-; CHECK-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; CHECK-NEXT:    vxorpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
   %f = fsub double 0.0, %e
diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll
index 384e40496d82a6..ce82ad7857dda8 100644
--- a/llvm/test/CodeGen/X86/packus.ll
+++ b/llvm/test/CodeGen/X86/packus.ll
@@ -118,25 +118,45 @@ define <8 x i16> @trunc_lshr_v8i32(<8 x i32> %a) nounwind {
 }
 
 define <8 x i16> @trunc_lshr_v4i64_demandedelts(<4 x i64> %a0) {
-; SSE2-LABEL: trunc_lshr_v4i64_demandedelts:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    ret{{[l|q]}}
+; X86-SSE2-LABEL: trunc_lshr_v4i64_demandedelts:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movd {{.*#+}} xmm2 = [1,0,0,0]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT:    packuswb %xmm1, %xmm0
+; X86-SSE2-NEXT:    retl
 ;
-; SSE4-LABEL: trunc_lshr_v4i64_demandedelts:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE4-NEXT:    pmovsxbd {{.*#+}} xmm2 = [1,1,1,1]
-; SSE4-NEXT:    pand %xmm2, %xmm1
-; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE4-NEXT:    pand %xmm2, %xmm0
-; SSE4-NEXT:    packusdw %xmm1, %xmm0
-; SSE4-NEXT:    ret{{[l|q]}}
+; X64-SSE2-LABEL: trunc_lshr_v4i64_demandedelts:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = [1,0]
+; X64-SSE2-NEXT:    pand %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT:    packuswb %xmm1, %xmm0
+; X64-SSE2-NEXT:    retq
+;
+; X86-SSE4-LABEL: trunc_lshr_v4i64_demandedelts:
+; X86-SSE4:       # %bb.0:
+; X86-SSE4-NEXT:    movd {{.*#+}} xmm2 = [1,0,0,0]
+; X86-SSE4-NEXT:    pand %xmm0, %xmm2
+; X86-SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
+; X86-SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE4-NEXT:    packusdw %xmm1, %xmm0
+; X86-SSE4-NEXT:    retl
+;
+; X64-SSE4-LABEL: trunc_lshr_v4i64_demandedelts:
+; X64-SSE4:       # %bb.0:
+; X64-SSE4-NEXT:    movq {{.*#+}} xmm2 = [1,0]
+; X64-SSE4-NEXT:    pand %xmm0, %xmm2
+; X64-SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
+; X64-SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE4-NEXT:    packusdw %xmm1, %xmm0
+; X64-SSE4-NEXT:    retq
 ;
 ; X86-AVX1-LABEL: trunc_lshr_v4i64_demandedelts:
 ; X86-AVX1:       # %bb.0:
@@ -447,8 +467,4 @@ define <32 x i8> @packuswb_icmp_zero_trunc_256(<16 x i16> %a0) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; X64-AVX2: {{.*}}
-; X64-SSE2: {{.*}}
-; X64-SSE4: {{.*}}
 ; X86-AVX2: {{.*}}
-; X86-SSE2: {{.*}}
-; X86-SSE4: {{.*}}
diff --git a/llvm/test/CodeGen/X86/peephole-fold-movsd.ll b/llvm/test/CodeGen/X86/peephole-fold-movsd.ll
index c0a6e00ec695e4..46d65e6f375e5f 100644
--- a/llvm/test/CodeGen/X86/peephole-fold-movsd.ll
+++ b/llvm/test/CodeGen/X86/peephole-fold-movsd.ll
@@ -18,7 +18,7 @@ define dso_local void @foo1(double %a.coerce0, double %a.coerce1, double %b.coer
 ; CHECK-NEXT:    movq %rsp, %rdi
 ; CHECK-NEXT:    callq foo3 at PLT
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,u]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
 ; CHECK-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; CHECK-NEXT:    addpd %xmm0, %xmm1
 ; CHECK-NEXT:    movapd %xmm1, g(%rip)
diff --git a/llvm/test/CodeGen/X86/pr14161.ll b/llvm/test/CodeGen/X86/pr14161.ll
index cdf3757e05b20d..a38ad03117855b 100644
--- a/llvm/test/CodeGen/X86/pr14161.ll
+++ b/llvm/test/CodeGen/X86/pr14161.ll
@@ -24,7 +24,8 @@ entry:
 define <2 x i16> @bad(ptr, ptr) {
 ; CHECK-LABEL: bad:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = mem[1,1,1,1]
+; CHECK-NEXT:    pinsrd $1, 4(%rdi), %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT:    pminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/pr30511.ll b/llvm/test/CodeGen/X86/pr30511.ll
index 088f3bfef8542b..0a4428d0a74e52 100644
--- a/llvm/test/CodeGen/X86/pr30511.ll
+++ b/llvm/test/CodeGen/X86/pr30511.ll
@@ -7,8 +7,9 @@ target triple = "x86_64-pc-linux-gnu"
 define i64 @PR30511(<2 x double> %a) {
 ; CHECK-LABEL: PR30511:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cvtdq2pd %xmm0, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [6.755399441055744E+15,0.0E+0]
+; CHECK-NEXT:    addpd %xmm0, %xmm1
+; CHECK-NEXT:    cvtdq2pd %xmm1, %xmm0
 ; CHECK-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    movq %xmm0, %rax
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr31956.ll b/llvm/test/CodeGen/X86/pr31956.ll
index 38b55a5c32a617..692cdaff33fc1e 100644
--- a/llvm/test/CodeGen/X86/pr31956.ll
+++ b/llvm/test/CodeGen/X86/pr31956.ll
@@ -9,10 +9,11 @@ target triple = "x86_64-scei-ps4"
 define <4 x float> @foo() {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovaps G2(%rip), %xmm0
-; CHECK-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; CHECK-NEXT:    vbroadcastss G2+16(%rip), %xmm0
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],zero,zero
+; CHECK-NEXT:    vbroadcastss G2+24(%rip), %xmm1
+; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; CHECK-NEXT:    retq
 entry:
   %V = load <2 x float>, ptr @G1, align 8
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
index aed5ea3ed217b7..517d93f487883e 100644
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -52,24 +52,24 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-O3-NEXT:    movq %rsp, %rbp
 ; CHECK-O3-NEXT:    andq $-32, %rsp
 ; CHECK-O3-NEXT:    subq $32, %rsp
-; CHECK-O3-NEXT:    vmovdqa 208(%rbp), %ymm3
-; CHECK-O3-NEXT:    vmovdqa 144(%rbp), %ymm0
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-O3-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; CHECK-O3-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; CHECK-O3-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,1,2,1]
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; CHECK-O3-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1]
-; CHECK-O3-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
-; CHECK-O3-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
-; CHECK-O3-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; CHECK-O3-NEXT:    vpbroadcastq 248(%rbp), %ymm4
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7]
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
-; CHECK-O3-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
-; CHECK-O3-NEXT:    vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
-; CHECK-O3-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7]
+; CHECK-O3-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-O3-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-O3-NEXT:    vbroadcastsd 160(%rbp), %ymm3
+; CHECK-O3-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; CHECK-O3-NEXT:    vbroadcastsd 216(%rbp), %ymm4
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; CHECK-O3-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2]
+; CHECK-O3-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,1]
+; CHECK-O3-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; CHECK-O3-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1]
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; CHECK-O3-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
+; CHECK-O3-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; CHECK-O3-NEXT:    vbroadcastsd 248(%rbp), %ymm4
+; CHECK-O3-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7]
 ; CHECK-O3-NEXT:    movq %rbp, %rsp
 ; CHECK-O3-NEXT:    popq %rbp
 ; CHECK-O3-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr36553.ll b/llvm/test/CodeGen/X86/pr36553.ll
index b61ec814730811..17649d43352d29 100644
--- a/llvm/test/CodeGen/X86/pr36553.ll
+++ b/llvm/test/CodeGen/X86/pr36553.ll
@@ -8,7 +8,8 @@ define float @pr36553(float %a, float %b, float %c) nounwind {
 ; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    callq _fmaf
-; CHECK-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/pr40811.ll b/llvm/test/CodeGen/X86/pr40811.ll
index 7851856713e82a..63bfbcec1e1dae 100644
--- a/llvm/test/CodeGen/X86/pr40811.ll
+++ b/llvm/test/CodeGen/X86/pr40811.ll
@@ -4,10 +4,11 @@
 define <8 x i32> @_Z6test70v(ptr %id14793) {
 ; CHECK-LABEL: _Z6test70v:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovaps (%rdi), %xmm0
-; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2,3]
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3,1,0]
-; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2,1,0]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-NEXT:    vpinsrd $1, {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,0]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/pr63091.ll b/llvm/test/CodeGen/X86/pr63091.ll
index 3f50be8ab8df9a..9f4700e94df681 100644
--- a/llvm/test/CodeGen/X86/pr63091.ll
+++ b/llvm/test/CodeGen/X86/pr63091.ll
@@ -35,9 +35,10 @@ define <4 x i32> @dont_merge_pcmpgt(<16 x i8> %0, <4 x i32> %1) {
 define <4 x i32> @merge_and(<16 x i8> %0, <4 x i32> %1) {
 ; SSE-LABEL: merge_and:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    pinsrd $3, {{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7]
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: merge_and:
diff --git a/llvm/test/CodeGen/X86/sar_fold64.ll b/llvm/test/CodeGen/X86/sar_fold64.ll
index 245af74c238912..234dbbd620d514 100644
--- a/llvm/test/CodeGen/X86/sar_fold64.ll
+++ b/llvm/test/CodeGen/X86/sar_fold64.ll
@@ -99,16 +99,18 @@ define <4 x i32> @all_sign_bit_ashr_vec0(<4 x i32> %x) {
 define <4 x i32> @all_sign_bit_ashr_vec1(<4 x i32> %x) {
 ; SSE-LABEL: all_sign_bit_ashr_vec1:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT:    movd {{.*#+}} xmm1 = [1,0,0,0]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE-NEXT:    pxor %xmm0, %xmm0
 ; SSE-NEXT:    psubd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: all_sign_bit_ashr_vec1:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
@@ -159,16 +161,18 @@ define <4 x i32> @all_sign_bit_ashr_vec2(<4 x i32> %x) {
 define <4 x i32> @all_sign_bit_ashr_vec3(<4 x i32> %x) {
 ; SSE-LABEL: all_sign_bit_ashr_vec3:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT:    movd {{.*#+}} xmm1 = [1,0,0,0]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE-NEXT:    paddd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: all_sign_bit_ashr_vec3:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll
index 0745881b2f3a32..d8f0c135c5afec 100644
--- a/llvm/test/CodeGen/X86/setcc-combine.ll
+++ b/llvm/test/CodeGen/X86/setcc-combine.ll
@@ -357,7 +357,7 @@ define i64 @sub_constant_to_shift_to_add(i32 %x, i64 %s1, i64 %s2) {
 define float @olt(float %x) {
 ; CHECK-LABEL: olt:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT:    xorps %xmm0, %xmm1
 ; CHECK-NEXT:    minss %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -370,7 +370,7 @@ define float @olt(float %x) {
 define double @ogt(double %x) {
 ; CHECK-LABEL: ogt:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
 ; CHECK-NEXT:    xorpd %xmm0, %xmm1
 ; CHECK-NEXT:    maxsd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -486,7 +486,7 @@ define double @ogt_no_fneg(double %x, double %y) {
 define double @ogt_no_zero(double %x) {
 ; CHECK-LABEL: ogt_no_zero:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
 ; CHECK-NEXT:    xorpd %xmm0, %xmm1
 ; CHECK-NEXT:    movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
 ; CHECK-NEXT:    cmpltsd %xmm0, %xmm2
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index 2ac2be5545dfdc..a69f13839e53fa 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -119,22 +119,18 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-AVX2-NEXT:  .LBB0_2: # %vector.body
 ; CHECK-AVX2-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-AVX2-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %xmm5
-; CHECK-AVX2-NEXT:    vmovdqu 1040(%rdx,%rsi), %xmm6
-; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %rdi
-; CHECK-AVX2-NEXT:    vpextrq $1, %xmm6, %r8
-; CHECK-AVX2-NEXT:    vmovq %xmm5, %r9
-; CHECK-AVX2-NEXT:    vmovq %xmm6, %r10
-; CHECK-AVX2-NEXT:    negq %r10
-; CHECK-AVX2-NEXT:    movq %rcx, %r10
-; CHECK-AVX2-NEXT:    sbbq %r8, %r10
-; CHECK-AVX2-NEXT:    setge %r8b
-; CHECK-AVX2-NEXT:    movzbl %r8b, %r8d
+; CHECK-AVX2-NEXT:    movq 1040(%rdx,%rsi), %rdi
+; CHECK-AVX2-NEXT:    movq 1024(%rdx,%rsi), %r8
+; CHECK-AVX2-NEXT:    negq %rdi
+; CHECK-AVX2-NEXT:    movq %rcx, %rdi
+; CHECK-AVX2-NEXT:    sbbq 1048(%rdx,%rsi), %rdi
+; CHECK-AVX2-NEXT:    setge %dil
+; CHECK-AVX2-NEXT:    movzbl %dil, %edi
+; CHECK-AVX2-NEXT:    negq %rdi
+; CHECK-AVX2-NEXT:    vmovq %rdi, %xmm5
 ; CHECK-AVX2-NEXT:    negq %r8
-; CHECK-AVX2-NEXT:    vmovq %r8, %xmm5
-; CHECK-AVX2-NEXT:    negq %r9
-; CHECK-AVX2-NEXT:    movq %rcx, %r8
-; CHECK-AVX2-NEXT:    sbbq %rdi, %r8
+; CHECK-AVX2-NEXT:    movq %rcx, %rdi
+; CHECK-AVX2-NEXT:    sbbq 1032(%rdx,%rsi), %rdi
 ; CHECK-AVX2-NEXT:    setge %dil
 ; CHECK-AVX2-NEXT:    movzbl %dil, %edi
 ; CHECK-AVX2-NEXT:    negq %rdi
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index e53eed45877975..e83151f3eaa1e8 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1864,9 +1864,10 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    psllq $32, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = [65536,0,0,0]
+; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X86-SSE-NEXT:    psllq $32, %xmm1
+; X86-SSE-NEXT:    movq %xmm1, (%edx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_varconst3:
@@ -1885,9 +1886,10 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT:    movq c(%rip), %rax
 ; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT:    psllq $32, %xmm0
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = [65536,0,0,0]
+; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT:    psllq $32, %xmm1
+; X64-SSE-NEXT:    movq %xmm1, (%rax,%rsi,4)
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: mul_2xi16_varconst3:
@@ -1922,9 +1924,10 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    psrad $16, %xmm0
-; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    psllq $32, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = [32768,0,0,0]
+; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X86-SSE-NEXT:    psllq $32, %xmm1
+; X86-SSE-NEXT:    movq %xmm1, (%edx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_varconst4:
@@ -1943,9 +1946,10 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT:    movq c(%rip), %rax
 ; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-SSE-NEXT:    psrad $16, %xmm0
-; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT:    psllq $32, %xmm0
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
+; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = [32768,0,0,0]
+; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT:    psllq $32, %xmm1
+; X64-SSE-NEXT:    movq %xmm1, (%rax,%rsi,4)
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: mul_2xi16_varconst4:
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index ec442c185706cf..3e880589566cc3 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -384,22 +384,26 @@ define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) {
 define <4 x double> @PR34175(ptr %p) {
 ; AVX512F-LABEL: PR34175:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT:    vpinsrw $0, 48(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpinsrw $0, 32(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512F-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: PR34175:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqu 32(%rdi), %xmm1
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT:    vpinsrw $0, 48(%rdi), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpinsrw $0, 32(%rdi), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512VL-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm2
+; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512VL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll
index 76a95e25045700..459f128472ef02 100644
--- a/llvm/test/CodeGen/X86/splat-for-size.ll
+++ b/llvm/test/CodeGen/X86/splat-for-size.ll
@@ -388,13 +388,13 @@ define <8 x i64> @pr23259() #1 {
 ; AVX-LABEL: pr23259:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm1[2,3]
+; AVX-NEXT:    vpinsrq $0, A+16(%rip), %xmm1, %xmm0
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: pr23259:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovaps A+16(%rip), %xmm0
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3,4,5,6,7]
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
 ; AVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
index 2b78a70ebcc26f..fad383c7b46b0c 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
@@ -20,7 +20,8 @@ define float @f32_no_daz(float %f) #0 {
 ; NHM-NEXT:    mulss %xmm1, %xmm2
 ; NHM-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; NHM-NEXT:    mulss %xmm3, %xmm2
-; NHM-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; NHM-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; NHM-NEXT:    andps %xmm1, %xmm0
 ; NHM-NEXT:    cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; NHM-NEXT:    andnps %xmm2, %xmm0
 ; NHM-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll
index 85f7733e671a76..31de79ea0fe64b 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll
@@ -12,7 +12,8 @@ define float @f32_tune_nhm(float %f) #0 {
 ; CHECK-NEXT:    mulss %xmm1, %xmm2
 ; CHECK-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; CHECK-NEXT:    mulss %xmm3, %xmm2
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    andnps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -49,7 +50,8 @@ define float @f32_tune_x86_64(float %f) #3 {
 ; CHECK-NEXT:    mulss %xmm1, %xmm2
 ; CHECK-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; CHECK-NEXT:    mulss %xmm3, %xmm2
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    andps %xmm1, %xmm0
 ; CHECK-NEXT:    cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    andnps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 384f8b832afb95..0b304136ccfea3 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -764,16 +764,18 @@ define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x
 define double @div_sqrt_fabs_f64(double %x, double %y, double %z) {
 ; SSE-LABEL: div_sqrt_fabs_f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE-NEXT:    sqrtsd %xmm2, %xmm2
-; SSE-NEXT:    mulsd %xmm2, %xmm1
-; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm3 = [NaN,0.0E+0]
+; SSE-NEXT:    andpd %xmm1, %xmm3
+; SSE-NEXT:    mulsd %xmm2, %xmm3
+; SSE-NEXT:    divsd %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: div_sqrt_fabs_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vsqrtsd %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = [NaN,0.0E+0]
+; AVX-NEXT:    vandpd %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vmulsd %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 08d9183bd30b67..d794340d14701d 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -1014,7 +1014,8 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
 ; CHECK-SSE2-NEXT:    pand %xmm0, %xmm2
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
@@ -1036,7 +1037,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE41-LABEL: test_srem_odd_INT_MIN:
 ; CHECK-SSE41:       # %bb.0:
 ; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-SSE41-NEXT:    pinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm2
 ; CHECK-SSE41-NEXT:    pand %xmm0, %xmm2
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
 ; CHECK-SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1051,7 +1052,8 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-AVX1-LABEL: test_srem_odd_INT_MIN:
 ; CHECK-AVX1:       # %bb.0:
 ; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT:    vpinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm2
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm2, %xmm1
 ; CHECK-AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1122,10 +1124,12 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; CHECK-SSE2-NEXT:    pxor %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; CHECK-SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; CHECK-SSE2-NEXT:    pand %xmm0, %xmm3
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2]
 ; CHECK-SSE2-NEXT:    psrld $31, %xmm1
 ; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -1147,7 +1151,8 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [306783378,306783378,1,306783378]
 ; CHECK-SSE41-NEXT:    pminud %xmm3, %xmm2
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm2
-; CHECK-SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm3
+; CHECK-SSE41-NEXT:    pand %xmm3, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
@@ -1168,7 +1173,8 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-AVX1-NEXT:    vpor %xmm4, %xmm2, %xmm2
 ; CHECK-AVX1-NEXT:    vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm3
+; CHECK-AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
@@ -1238,10 +1244,12 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; CHECK-SSE2-NEXT:    pxor %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; CHECK-SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; CHECK-SSE2-NEXT:    pand %xmm0, %xmm3
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2]
 ; CHECK-SSE2-NEXT:    psrld $31, %xmm1
 ; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -1263,7 +1271,8 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [858993458,306783378,1,42949672]
 ; CHECK-SSE41-NEXT:    pminud %xmm3, %xmm2
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm2
-; CHECK-SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm3
+; CHECK-SSE41-NEXT:    pand %xmm3, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
@@ -1284,7 +1293,8 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-AVX1-NEXT:    vpor %xmm4, %xmm2, %xmm2
 ; CHECK-AVX1-NEXT:    vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm3
+; CHECK-AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
@@ -2211,59 +2221,61 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-SSE2-LABEL: pr51133:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    movq %rdi, %rax
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm5
-; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [9,0,41,183,1,1,161,221]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; CHECK-SSE2-NEXT:    pand %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm6
-; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [171,103,183,171,61,1,127,183]
-; CHECK-SSE2-NEXT:    pand %xmm4, %xmm6
-; CHECK-SSE2-NEXT:    packuswb %xmm5, %xmm6
-; CHECK-SSE2-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; CHECK-SSE2-NEXT:    movdqa %xmm6, %xmm5
-; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [128,1,128,1,128,32,1,1]
-; CHECK-SSE2-NEXT:    psrlw $8, %xmm5
-; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [1,1,1,128,64,2,1,32]
-; CHECK-SSE2-NEXT:    psrlw $8, %xmm6
-; CHECK-SSE2-NEXT:    packuswb %xmm5, %xmm6
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
-; CHECK-SSE2-NEXT:    pminub %xmm6, %xmm7
-; CHECK-SSE2-NEXT:    pcmpeqb %xmm6, %xmm7
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SSE2-NEXT:    pandn %xmm5, %xmm7
-; CHECK-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm6, %xmm6
-; CHECK-SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
-; CHECK-SSE2-NEXT:    pandn %xmm1, %xmm5
-; CHECK-SSE2-NEXT:    por %xmm7, %xmm5
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; CHECK-SSE2-NEXT:    movzbl {{\.?LCPI[0-9]+_[0-9]+}}+5(%rip), %ecx
+; CHECK-SSE2-NEXT:    movd %ecx, %xmm6
+; CHECK-SSE2-NEXT:    psllq $40, %xmm6
+; CHECK-SSE2-NEXT:    pand %xmm1, %xmm6
 ; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [223,223,205,183,161,1,171,239]
-; CHECK-SSE2-NEXT:    pand %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [9,0,41,183,1,1,161,221]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; CHECK-SSE2-NEXT:    pand %xmm5, %xmm1
+; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [171,103,183,171,61,1,127,183]
+; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT:    packuswb %xmm1, %xmm4
+; CHECK-SSE2-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,1,128,1,128,32,1,1]
+; CHECK-SSE2-NEXT:    psrlw $8, %xmm1
+; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [1,1,1,128,64,2,1,32]
+; CHECK-SSE2-NEXT:    psrlw $8, %xmm4
+; CHECK-SSE2-NEXT:    packuswb %xmm1, %xmm4
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
+; CHECK-SSE2-NEXT:    pminub %xmm4, %xmm7
+; CHECK-SSE2-NEXT:    pcmpeqb %xmm4, %xmm7
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SSE2-NEXT:    pandn %xmm4, %xmm7
+; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    pcmpgtb %xmm1, %xmm6
+; CHECK-SSE2-NEXT:    pandn %xmm6, %xmm4
+; CHECK-SSE2-NEXT:    por %xmm7, %xmm4
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm6
+; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [223,223,205,183,161,1,171,239]
+; CHECK-SSE2-NEXT:    pand %xmm5, %xmm6
 ; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,205,27,241,1,1,1,163]
-; CHECK-SSE2-NEXT:    pand %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    packuswb %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pand %xmm5, %xmm0
+; CHECK-SSE2-NEXT:    packuswb %xmm6, %xmm0
 ; CHECK-SSE2-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,128,1,1,1,128,1,64]
-; CHECK-SSE2-NEXT:    psrlw $8, %xmm1
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm5
+; CHECK-SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [128,128,1,1,1,128,1,64]
+; CHECK-SSE2-NEXT:    psrlw $8, %xmm5
 ; CHECK-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; CHECK-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,1,128,128,32,128,32]
 ; CHECK-SSE2-NEXT:    psrlw $8, %xmm0
-; CHECK-SSE2-NEXT:    packuswb %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
-; CHECK-SSE2-NEXT:    pmaxub %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqb %xmm6, %xmm3
-; CHECK-SSE2-NEXT:    pandn %xmm5, %xmm3
-; CHECK-SSE2-NEXT:    pcmpeqb %xmm6, %xmm2
-; CHECK-SSE2-NEXT:    pandn %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    packuswb %xmm5, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
+; CHECK-SSE2-NEXT:    pmaxub %xmm0, %xmm5
+; CHECK-SSE2-NEXT:    pcmpeqb %xmm0, %xmm5
+; CHECK-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
+; CHECK-SSE2-NEXT:    pandn %xmm4, %xmm3
+; CHECK-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pandn %xmm5, %xmm2
 ; CHECK-SSE2-NEXT:    pmovmskb %xmm2, %ecx
 ; CHECK-SSE2-NEXT:    pmovmskb %xmm3, %edx
 ; CHECK-SSE2-NEXT:    shll $16, %edx
@@ -2474,7 +2486,9 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; CHECK-AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-AVX512VL-NEXT:    vpinsrb $5, {{\.?LCPI[0-9]+_[0-9]+}}+21(%rip), %xmm0, %xmm4
+; CHECK-AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; CHECK-AVX512VL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; CHECK-AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; CHECK-AVX512VL-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
 ; CHECK-AVX512VL-NEXT:    vpandn %ymm0, %ymm3, %ymm3
diff --git a/llvm/test/CodeGen/X86/sse-align-12.ll b/llvm/test/CodeGen/X86/sse-align-12.ll
index 7b4bd3ffdf00c5..b8873e3839cd2b 100644
--- a/llvm/test/CodeGen/X86/sse-align-12.ll
+++ b/llvm/test/CodeGen/X86/sse-align-12.ll
@@ -54,8 +54,8 @@ define <2 x double> @c(ptr %y) nounwind {
 define <2 x double> @d(ptr %y, <2 x double> %z) nounwind {
 ; CHECK-LABEL: d:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movups (%rdi), %xmm1
-; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
 ; CHECK-NEXT:    retq
   %x = load <2 x double>, ptr %y, align 8
   %a = extractelement <2 x double> %x, i32 1
diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll
index cf5f527b16114f..67c0f393c08605 100644
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@@ -392,19 +392,19 @@ define <2 x double> @test11(double %a, double %b) nounwind {
 define void @test12() nounwind {
 ; SSE-LABEL: test12:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd 0, %xmm0
-; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
-; SSE-NEXT:    addps %xmm1, %xmm2
-; SSE-NEXT:    movaps %xmm2, 0
+; SSE-NEXT:    movaps 0, %xmm0
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE-NEXT:    addps %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, 0
 ; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX1-LABEL: test12:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovaps 0, %xmm0
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
 ; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -529,29 +529,45 @@ define <4 x float> @test15(ptr %x, ptr %y) nounwind {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movaps (%ecx), %xmm0
-; X86-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-SSE-NEXT:    retl
 ;
-; X86-AVX-LABEL: test15:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
-; X86-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; X86-AVX-NEXT:    retl
+; X86-AVX1-LABEL: test15:
+; X86-AVX1:       # %bb.0: # %entry
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; X86-AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX512-LABEL: test15:
+; X86-AVX512:       # %bb.0: # %entry
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X86-AVX512-NEXT:    vunpcklpd 8(%eax){1to2}, %xmm0, %xmm0
+; X86-AVX512-NEXT:    retl
 ;
 ; X64-SSE-LABEL: test15:
 ; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movaps (%rdi), %xmm0
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X64-SSE-NEXT:    retq
 ;
-; X64-AVX-LABEL: test15:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
-; X64-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; X64-AVX-NEXT:    retq
+; X64-AVX1-LABEL: test15:
+; X64-AVX1:       # %bb.0: # %entry
+; X64-AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; X64-AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX512-LABEL: test15:
+; X64-AVX512:       # %bb.0: # %entry
+; X64-AVX512-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX512-NEXT:    vunpcklpd 8(%rsi){1to2}, %xmm0, %xmm0
+; X64-AVX512-NEXT:    retq
 entry:
   %tmp = load <4 x float>, ptr %y             ; <<4 x float>> [#uses=1]
   %tmp3 = load <4 x float>, ptr %x            ; <<4 x float>> [#uses=1]
@@ -565,27 +581,27 @@ define  <2 x double> @test16(ptr nocapture %srcA, ptr nocapture %dst) {
 ; X86-SSE-LABEL: test16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movaps 96(%eax), %xmm0
-; X86-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test16:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    vmovaps 96(%eax), %xmm0
-; X86-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: test16:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movaps 96(%rdi), %xmm0
-; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test16:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovaps 96(%rdi), %xmm0
-; X64-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X64-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X64-AVX-NEXT:    retq
   %i5 = getelementptr inbounds <4 x double>, ptr %srcA, i32 3
   %i6 = load <4 x double>, ptr %i5, align 32
@@ -700,8 +716,3 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
   %m = mul <4 x i32> %x, %y
   ret <4 x i32> %m
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; X64-AVX1: {{.*}}
-; X64-AVX512: {{.*}}
-; X86-AVX1: {{.*}}
-; X86-AVX512: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll
index 1a4df9a175ffad..2d3008d980a6da 100644
--- a/llvm/test/CodeGen/X86/sse3.ll
+++ b/llvm/test/CodeGen/X86/sse3.ll
@@ -39,20 +39,22 @@ define <8 x i16> @t1(ptr %A, ptr %B) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X86-NEXT:    movaps %xmm0, %xmm1
-; X86-NEXT:    andnps (%ecx), %xmm1
-; X86-NEXT:    andps (%eax), %xmm0
-; X86-NEXT:    orps %xmm1, %xmm0
+; X86-NEXT:    movaps (%eax), %xmm2
+; X86-NEXT:    andps %xmm0, %xmm2
+; X86-NEXT:    andnps %xmm1, %xmm0
+; X86-NEXT:    orps %xmm2, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t1:
 ; X64:       # %bb.0:
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X64-NEXT:    movaps %xmm0, %xmm1
-; X64-NEXT:    andnps (%rsi), %xmm1
-; X64-NEXT:    andps (%rdi), %xmm0
-; X64-NEXT:    orps %xmm1, %xmm0
+; X64-NEXT:    movaps (%rdi), %xmm2
+; X64-NEXT:    andps %xmm0, %xmm2
+; X64-NEXT:    andnps %xmm1, %xmm0
+; X64-NEXT:    orps %xmm2, %xmm0
 ; X64-NEXT:    retq
 	%tmp1 = load <8 x i16>, ptr %A
 	%tmp2 = load <8 x i16>, ptr %B
@@ -395,14 +397,14 @@ entry:
 define <4 x i32> @t17() nounwind {
 ; X86-LABEL: t17:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t17:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X64-NEXT:    retq
 entry:
   %tmp1 = load <4 x float>, ptr undef, align 16
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 2d7258a49f5d09..07d7f82d809067 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -560,46 +560,40 @@ define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, ptr nocapture
 ; X86-SSE-LABEL: insertps_from_shufflevector_1:
 ; X86-SSE:       ## %bb.0: ## %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
-; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-SSE-NEXT:    insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_shufflevector_1:
 ; X86-AVX1:       ## %bb.0: ## %entry
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-AVX1-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_from_shufflevector_1:
 ; X86-AVX512:       ## %bb.0: ## %entry
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-AVX512-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: insertps_from_shufflevector_1:
 ; X64-SSE:       ## %bb.0: ## %entry
-; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
-; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-SSE-NEXT:    insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_shufflevector_1:
 ; X64-AVX1:       ## %bb.0: ## %entry
-; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-AVX1-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: insertps_from_shufflevector_1:
 ; X64-AVX512:       ## %bb.0: ## %entry
-; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-AVX512-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
 entry:
   %0 = load <4 x float>, ptr %pb, align 16
@@ -636,8 +630,10 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, ptr nocapture read
 ; X86-SSE-LABEL: pinsrd_from_shufflevector_i32:
 ; X86-SSE:       ## %bb.0: ## %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    pshufd $0, (%eax), %xmm1 ## encoding: [0x66,0x0f,0x70,0x08,0x00]
-; X86-SSE-NEXT:    ## xmm1 = mem[0,0,0,0]
+; X86-SSE-NEXT:    movd (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    ## encoding: [0x66,0x0f,0x6e,0x08]
+; X86-SSE-NEXT:    pshufd $0, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x00]
+; X86-SSE-NEXT:    ## xmm1 = xmm1[0,0,0,0]
 ; X86-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
 ; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
@@ -660,8 +656,10 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, ptr nocapture read
 ;
 ; X64-SSE-LABEL: pinsrd_from_shufflevector_i32:
 ; X64-SSE:       ## %bb.0: ## %entry
-; X64-SSE-NEXT:    pshufd $0, (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x70,0x0f,0x00]
-; X64-SSE-NEXT:    ## xmm1 = mem[0,0,0,0]
+; X64-SSE-NEXT:    movd (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    ## encoding: [0x66,0x0f,0x6e,0x0f]
+; X64-SSE-NEXT:    pshufd $0, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x00]
+; X64-SSE-NEXT:    ## xmm1 = xmm1[0,0,0,0]
 ; X64-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
 ; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
@@ -1372,46 +1370,40 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, ptr nocapture read
 ; X86-SSE-LABEL: insertps_from_vector_load:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
-; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-SSE-NEXT:    insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_vector_load:
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-AVX1-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_from_vector_load:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-AVX512-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: insertps_from_vector_load:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
-; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-SSE-NEXT:    insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_vector_load:
 ; X64-AVX1:       ## %bb.0:
-; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-AVX1-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: insertps_from_vector_load:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-AVX512-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %1 = load <4 x float>, ptr %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -1424,46 +1416,40 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, ptr nocaptu
 ; X86-SSE-LABEL: insertps_from_vector_load_offset:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
-; X86-SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X86-SSE-NEXT:    insertps $32, 4(%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x40,0x04,0x20]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_vector_load_offset:
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X86-AVX1-NEXT:    vinsertps $32, 4(%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x40,0x04,0x20]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_from_vector_load_offset:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X86-AVX512-NEXT:    vinsertps $32, 4(%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x40,0x04,0x20]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: insertps_from_vector_load_offset:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
-; X64-SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X64-SSE-NEXT:    insertps $32, 4(%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x47,0x04,0x20]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_vector_load_offset:
 ; X64-AVX1:       ## %bb.0:
-; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X64-AVX1-NEXT:    vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x47,0x04,0x20]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: insertps_from_vector_load_offset:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X64-AVX512-NEXT:    vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x47,0x04,0x20]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %1 = load <4 x float>, ptr %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -1477,9 +1463,10 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-SSE-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
-; X86-SSE-NEXT:    movaps (%eax,%ecx), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x08]
-; X86-SSE-NEXT:    insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0]
-; X86-SSE-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X86-SSE-NEXT:    movss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x4c,0x08,0x0c]
+; X86-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
+; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_vector_load_offset_2:
@@ -1487,9 +1474,10 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-AVX1-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
-; X86-AVX1-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x08]
-; X86-AVX1-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
-; X86-AVX1-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X86-AVX1-NEXT:    vmovss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x4c,0x08,0x0c]
+; X86-AVX1-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_from_vector_load_offset_2:
@@ -1497,33 +1485,37 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-AVX512-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
-; X86-AVX512-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x08]
-; X86-AVX512-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
-; X86-AVX512-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X86-AVX512-NEXT:    vmovss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x08,0x0c]
+; X86-AVX512-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: insertps_from_vector_load_offset_2:
 ; X64-SSE:       ## %bb.0:
 ; X64-SSE-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
-; X64-SSE-NEXT:    movaps (%rdi,%rsi), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x37]
-; X64-SSE-NEXT:    insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0]
-; X64-SSE-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X64-SSE-NEXT:    movss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x4c,0x37,0x0c]
+; X64-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
+; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_vector_load_offset_2:
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
-; X64-AVX1-NEXT:    vmovaps (%rdi,%rsi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x37]
-; X64-AVX1-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
-; X64-AVX1-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X64-AVX1-NEXT:    vmovss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x4c,0x37,0x0c]
+; X64-AVX1-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: insertps_from_vector_load_offset_2:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
-; X64-AVX512-NEXT:    vmovaps (%rdi,%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x37]
-; X64-AVX512-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
-; X64-AVX512-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
+; X64-AVX512-NEXT:    vmovss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x37,0x0c]
+; X64-AVX512-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %1 = getelementptr inbounds <4 x float>, ptr %pb, i64 %index
   %2 = load <4 x float>, ptr %1, align 16
@@ -1587,9 +1579,8 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, ptr nocapt
 ; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08]
-; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X86-SSE-NEXT:    insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
@@ -1608,9 +1599,8 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, ptr nocapt
 ;
 ; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f]
-; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-SSE-NEXT:    insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
@@ -1819,46 +1809,40 @@ define <4 x float> @pr20087(<4 x float> %a, ptr%ptr) {
 ; X86-SSE-LABEL: pr20087:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
-; X86-SSE-NEXT:    insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X86-SSE-NEXT:    insertps $50, 8(%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x40,0x08,0x32]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: pr20087:
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX1-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X86-AVX1-NEXT:    vinsertps $50, 8(%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x40,0x08,0x32]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: pr20087:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
-; X86-AVX512-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X86-AVX512-NEXT:    vinsertps $50, 8(%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x40,0x08,0x32]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: pr20087:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
-; X64-SSE-NEXT:    insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2]
-; X64-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X64-SSE-NEXT:    insertps $50, 8(%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x47,0x08,0x32]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: pr20087:
 ; X64-AVX1:       ## %bb.0:
-; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX1-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X64-AVX1-NEXT:    vinsertps $50, 8(%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x47,0x08,0x32]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: pr20087:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
-; X64-AVX512-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
+; X64-AVX512-NEXT:    vinsertps $50, 8(%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x47,0x08,0x32]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %load = load <4 x float> , ptr%ptr
   %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
diff --git a/llvm/test/CodeGen/X86/strict-fsub-combines.ll b/llvm/test/CodeGen/X86/strict-fsub-combines.ll
index 774ea02ccd87a4..be491bc330129e 100644
--- a/llvm/test/CodeGen/X86/strict-fsub-combines.ll
+++ b/llvm/test/CodeGen/X86/strict-fsub-combines.ll
@@ -8,9 +8,10 @@ define float @fneg_strict_fsub_to_strict_fadd(float %x, float %y) nounwind stric
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT:    subss %xmm1, %xmm0
+; X86-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT:    xorps %xmm1, %xmm2
+; X86-NEXT:    subss %xmm2, %xmm0
 ; X86-NEXT:    movss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    wait
@@ -19,8 +20,9 @@ define float @fneg_strict_fsub_to_strict_fadd(float %x, float %y) nounwind stric
 ;
 ; X64-LABEL: fneg_strict_fsub_to_strict_fadd:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-NEXT:    subss %xmm1, %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm2 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    xorps %xmm1, %xmm2
+; X64-NEXT:    subss %xmm2, %xmm0
 ; X64-NEXT:    retq
   %neg = fneg float %y
   %sub = call float @llvm.experimental.constrained.fsub.f32(float %x, float %neg, metadata!"round.dynamic", metadata!"fpexcept.strict")
@@ -48,8 +50,9 @@ define double @fneg_strict_fsub_to_strict_fadd_d(double %x, double %y) nounwind
 ;
 ; X64-LABEL: fneg_strict_fsub_to_strict_fadd_d:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-NEXT:    subsd %xmm1, %xmm0
+; X64-NEXT:    movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    xorpd %xmm1, %xmm2
+; X64-NEXT:    subsd %xmm2, %xmm0
 ; X64-NEXT:    retq
   %neg = fneg double %y
   %sub = call double @llvm.experimental.constrained.fsub.f64(double %x, double %neg, metadata!"round.dynamic", metadata!"fpexcept.strict")
@@ -63,8 +66,9 @@ define float @strict_fsub_fneg_to_strict_fsub(float %x, float %y) nounwind stric
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    subss {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-NEXT:    xorps %xmm0, %xmm1
+; X86-NEXT:    movss %xmm1, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    wait
 ; X86-NEXT:    popl %eax
@@ -73,7 +77,8 @@ define float @strict_fsub_fneg_to_strict_fsub(float %x, float %y) nounwind stric
 ; X64-LABEL: strict_fsub_fneg_to_strict_fsub:
 ; X64:       # %bb.0:
 ; X64-NEXT:    subss %xmm1, %xmm0
-; X64-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    xorps %xmm1, %xmm0
 ; X64-NEXT:    retq
   %sub = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata!"round.dynamic", metadata!"fpexcept.strict")
   %neg = fneg float %sub
@@ -101,7 +106,8 @@ define double @strict_fsub_fneg_to_strict_fsub_d(double %x, double %y) nounwind
 ; X64-LABEL: strict_fsub_fneg_to_strict_fsub_d:
 ; X64:       # %bb.0:
 ; X64-NEXT:    subsd %xmm1, %xmm0
-; X64-NEXT:    xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
+; X64-NEXT:    xorpd %xmm1, %xmm0
 ; X64-NEXT:    retq
   %sub = call double @llvm.experimental.constrained.fsub.f64(double %x, double %y, metadata!"round.dynamic", metadata!"fpexcept.strict")
   %neg = fneg double %sub
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
index 76183ac5f8fa3e..8d227493f3bbb8 100644
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -1647,13 +1647,13 @@ define <4 x double> @broadcast_v4f64_v2f64_4u61(ptr %vp, <4 x double> %default)
 ; X86-LABEL: broadcast_v4f64_v2f64_4u61:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vinsertf128 $1, (%eax), %ymm0, %ymm1
+; X86-NEXT:    vbroadcastsd 8(%eax), %ymm1
 ; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: broadcast_v4f64_v2f64_4u61:
 ; X64:       # %bb.0:
-; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm1
+; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm1
 ; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; X64-NEXT:    retq
   %vec = load <2 x double>, ptr %vp
diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll
index 953a0d65c5386c..029c76a9f3ad37 100644
--- a/llvm/test/CodeGen/X86/test-shrink-bug.ll
+++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll
@@ -66,8 +66,11 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) {
 ; CHECK-X64-LABEL: fail:
 ; CHECK-X64:       # %bb.0:
 ; CHECK-X64-NEXT:    pslld $8, %xmm0
-; CHECK-X64-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-X64-NEXT:    pextrw $1, %xmm0, %eax
+; CHECK-X64-NEXT:    movzbl {{\.?LCPI[0-9]+_[0-9]+}}+2(%rip), %eax
+; CHECK-X64-NEXT:    movd %eax, %xmm1
+; CHECK-X64-NEXT:    pslld $16, %xmm1
+; CHECK-X64-NEXT:    pcmpeqb %xmm0, %xmm1
+; CHECK-X64-NEXT:    pextrw $1, %xmm1, %eax
 ; CHECK-X64-NEXT:    xorb $1, %al
 ; CHECK-X64-NEXT:    testl $263, %edi # imm = 0x107
 ; CHECK-X64-NEXT:    setne %cl
diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
index 5ea991f85523ea..e78b5e19c5dc2f 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll
@@ -663,70 +663,20 @@ define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind {
 }
 
 define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-SKX-LABEL: transform_VUNPCKLPDrm:
-; CHECK-SKX:       # %bb.0:
-; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-SKX-NEXT:    retq
-;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-V4-LABEL: transform_VUNPCKLPDrm:
-; CHECK-V4:       # %bb.0:
-; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-V4-NEXT:    retq
-;
-; CHECK-AVX512-LABEL: transform_VUNPCKLPDrm:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrm:
-; CHECK-ZNVER4:       # %bb.0:
-; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-ZNVER4-NEXT:    retq
+; CHECK-LABEL: transform_VUNPCKLPDrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; CHECK-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x float> %shufp
 }
 
 define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-SKX-LABEL: transform_VUNPCKHPDrm:
-; CHECK-SKX:       # %bb.0:
-; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-SKX-NEXT:    retq
-;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-V4-LABEL: transform_VUNPCKHPDrm:
-; CHECK-V4:       # %bb.0:
-; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-V4-NEXT:    retq
-;
-; CHECK-AVX512-LABEL: transform_VUNPCKHPDrm:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrm:
-; CHECK-ZNVER4:       # %bb.0:
-; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-ZNVER4-NEXT:    retq
+; CHECK-LABEL: transform_VUNPCKHPDrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x float> %shufp
@@ -848,37 +798,43 @@ define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_
 ; CHECK-SKX-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-SKX:       # %bb.0:
 ; CHECK-SKX-NEXT:    kmovd %esi, %k1
-; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-SKX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-SKX-NEXT:    retq
 ;
 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
 ; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-V4:       # %bb.0:
 ; CHECK-V4-NEXT:    kmovd %esi, %k1
-; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-V4-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-V4-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-AVX512:       # %bb.0:
 ; CHECK-AVX512-NEXT:    kmovd %esi, %k1
-; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-AVX512-NEXT:    retq
 ;
 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmkz:
 ; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
-; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
   %b = load <2 x double>, ptr %pb
@@ -888,41 +844,11 @@ define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_
 }
 
 define <2 x double> @transform_VUNPCKHPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_int) nounwind {
-; CHECK-SKX-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-SKX:       # %bb.0:
-; CHECK-SKX-NEXT:    kmovd %esi, %k1
-; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-SKX-NEXT:    retq
-;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-V4-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-V4:       # %bb.0:
-; CHECK-V4-NEXT:    kmovd %esi, %k1
-; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-V4-NEXT:    retq
-;
-; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    kmovd %esi, %k1
-; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmkz:
-; CHECK-ZNVER4:       # %bb.0:
-; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
-; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-ZNVER4-NEXT:    retq
+; CHECK-LABEL: transform_VUNPCKHPDrmkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
   %b = load <2 x double>, ptr %pb
   %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
@@ -1060,42 +986,48 @@ define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x doubl
 ; CHECK-SKX-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-SKX:       # %bb.0:
 ; CHECK-SKX-NEXT:    kmovd %esi, %k1
-; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-SKX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-SKX-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-SKX-NEXT:    retq
 ;
 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
 ; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-ICX-BYPASS-DELAY-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-V4-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-V4:       # %bb.0:
 ; CHECK-V4-NEXT:    kmovd %esi, %k1
-; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-V4-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-V4-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-V4-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-V4-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-AVX512:       # %bb.0:
 ; CHECK-AVX512-NEXT:    kmovd %esi, %k1
-; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-AVX512-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-AVX512-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-AVX512-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-AVX512-NEXT:    retq
 ;
 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmk:
 ; CHECK-ZNVER4:       # %bb.0:
+; CHECK-ZNVER4-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
 ; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
-; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-ZNVER4-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0]
 ; CHECK-ZNVER4-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-ZNVER4-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
@@ -1106,47 +1038,12 @@ define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x doubl
 }
 
 define <2 x double> @transform_VUNPCKHPDrmk(<2 x double> %a, ptr %pb, <2 x double> %c, i2 %mask_int) nounwind {
-; CHECK-SKX-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-SKX:       # %bb.0:
-; CHECK-SKX-NEXT:    kmovd %esi, %k1
-; CHECK-SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-SKX-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-SKX-NEXT:    retq
-;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    kmovd %esi, %k1
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-V4-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-V4:       # %bb.0:
-; CHECK-V4-NEXT:    kmovd %esi, %k1
-; CHECK-V4-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-V4-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-V4-NEXT:    retq
-;
-; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    kmovd %esi, %k1
-; CHECK-AVX512-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-AVX512-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmk:
-; CHECK-ZNVER4:       # %bb.0:
-; CHECK-ZNVER4-NEXT:    kmovd %esi, %k1
-; CHECK-ZNVER4-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
-; CHECK-ZNVER4-NEXT:    vmovapd %xmm1, %xmm0
-; CHECK-ZNVER4-NEXT:    retq
+; CHECK-LABEL: transform_VUNPCKHPDrmk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %mask = bitcast i2 %mask_int to <2 x i1>
   %b = load <2 x double>, ptr %pb
   %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
index 6940c33c9d327d..ff02527b05d2dd 100644
--- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
+++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll
@@ -163,30 +163,10 @@ define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind {
 }
 
 define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind {
-; CHECK-AVX2-LABEL: transform_VUNPCKLPDrm:
-; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-AVX2-NEXT:    retq
-;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-SNB-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
-; CHECK-SNB-BYPASS-DELAY:       # %bb.0:
-; CHECK-SNB-BYPASS-DELAY-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-SNB-BYPASS-DELAY-NEXT:    retq
+; CHECK-LABEL: transform_VUNPCKLPDrm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; CHECK-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x float> %shufp
@@ -195,34 +175,30 @@ define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind {
 define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind {
 ; CHECK-AVX2-LABEL: transform_VUNPCKHPDrm:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-AVX2-NEXT:    retq
 ;
-; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
-; CHECK-ICX-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-ICX-NO-BYPASS-DELAY-NEXT:    retq
-;
-; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
-; CHECK-ICX-BYPASS-DELAY:       # %bb.0:
-; CHECK-ICX-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; CHECK-ICX-BYPASS-DELAY-NEXT:    retq
+; CHECK-ICX-LABEL: transform_VUNPCKHPDrm:
+; CHECK-ICX:       # %bb.0:
+; CHECK-ICX-NEXT:    vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0
+; CHECK-ICX-NEXT:    retq
 ;
 ; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
 ; CHECK-SNB-NO-BYPASS-DELAY:       # %bb.0:
-; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-SNB-NO-BYPASS-DELAY-NEXT:    retq
 ;
 ; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
 ; CHECK-SNB-BYPASS-DELAY:       # %bb.0:
-; CHECK-SNB-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-SNB-BYPASS-DELAY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-SNB-BYPASS-DELAY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-SNB-BYPASS-DELAY-NEXT:    retq
   %b = load <4 x float>, ptr %pb
   %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x float> %shufp
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
-; CHECK-ICX: {{.*}}
 ; CHECK-SKL: {{.*}}
 ; CHECK-V3: {{.*}}
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
index 36094fe56d5774..b36592bf90ae79 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
@@ -243,7 +243,8 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
 ; CHECK-AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vmovq {{.*#+}} xmm1 = [9223372036854775808,0]
+; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    movabsq $-3074457345618258603, %rax # imm = 0xD555555555555555
 ; CHECK-AVX1-NEXT:    vmovq %rax, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
@@ -262,7 +263,8 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
 ; CHECK-AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vmovq {{.*#+}} xmm1 = [9223372036854775808,0]
+; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    movabsq $-3074457345618258603, %rax # imm = 0xD555555555555555
 ; CHECK-AVX2-NEXT:    vmovq %rax, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
index 91743898545ee1..ace118ee17fada 100644
--- a/llvm/test/CodeGen/X86/vec_insert-5.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -34,18 +34,18 @@ define <4 x float> @t2(ptr %P) nounwind {
 ; X86-LABEL: t2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    xorps %xmm1, %xmm1
-; X86-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
+; X86-NEXT:    pxor %xmm0, %xmm0
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t2:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; X64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
+; X64-NEXT:    pxor %xmm0, %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
 ; X64-NEXT:    retq
   %tmp1 = load <4 x float>, ptr %P
   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
@@ -56,14 +56,12 @@ define <4 x float> @t3(ptr %P) nounwind {
 ; X86-LABEL: t3:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t3:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    retq
   %tmp1 = load <4 x float>, ptr %P
   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
@@ -74,18 +72,12 @@ define <4 x float> @t4(ptr %P) nounwind {
 ; X86-LABEL: t4:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorps %xmm1, %xmm1
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t4:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorps %xmm1, %xmm1
-; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
-; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    retq
   %tmp1 = load <4 x float>, ptr %P
   %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
@@ -96,27 +88,13 @@ define <4 x float> @t4_under_aligned(ptr %P) nounwind {
 ; X86-LABEL: t4_under_aligned:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movups (%eax), %xmm0
-; X86-NEXT:    xorps %xmm1, %xmm1
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    retl
 ;
-; ALIGN-LABEL: t4_under_aligned:
-; ALIGN:       # %bb.0:
-; ALIGN-NEXT:    movups (%rdi), %xmm0
-; ALIGN-NEXT:    xorps %xmm1, %xmm1
-; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
-; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; ALIGN-NEXT:    retq
-;
-; UNALIGN-LABEL: t4_under_aligned:
-; UNALIGN:       # %bb.0:
-; UNALIGN-NEXT:    xorps %xmm1, %xmm1
-; UNALIGN-NEXT:    xorps %xmm0, %xmm0
-; UNALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0]
-; UNALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
-; UNALIGN-NEXT:    retq
+; X64-LABEL: t4_under_aligned:
+; X64:       # %bb.0:
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    retq
   %tmp1 = load <4 x float>, ptr %P, align 4
   %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
   ret <4 x float> %tmp2
@@ -191,3 +169,6 @@ define <16 x i8> @t9(<16 x i8> %x) nounwind {
   %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 undef, i32 undef>
   ret <16 x i8> %s
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALIGN: {{.*}}
+; UNALIGN: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index af841cf38b24ae..c524e8956f7908 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -4119,26 +4119,14 @@ define <8 x float> @sitofp_load_8i8_to_8f32(ptr%a) {
 define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) {
 ; SSE2-LABEL: uitofp_load_4i64_to_4f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq 24(%rdi), %rax
-; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    movq 24(%rdi), %rcx
+; SSE2-NEXT:    testq %rcx, %rcx
 ; SSE2-NEXT:    js .LBB83_1
 ; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    jmp .LBB83_3
-; SSE2-NEXT:  .LBB83_1:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    addss %xmm0, %xmm0
-; SSE2-NEXT:  .LBB83_3:
-; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm0
 ; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB83_4
-; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    jmp .LBB83_6
+; SSE2-NEXT:    jns .LBB83_5
 ; SSE2-NEXT:  .LBB83_4:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
@@ -4146,6 +4134,18 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) {
 ; SSE2-NEXT:    orq %rcx, %rax
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:    jmp .LBB83_6
+; SSE2-NEXT:  .LBB83_1:
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    shrq %rdx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm0
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB83_4
+; SSE2-NEXT:  .LBB83_5:
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:  .LBB83_6:
 ; SSE2-NEXT:    movq (%rdi), %rax
 ; SSE2-NEXT:    movq 8(%rdi), %rcx
@@ -4448,26 +4448,14 @@ define <4 x float> @uitofp_load_4i8_to_4f32(ptr%a) {
 define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; SSE2-LABEL: uitofp_load_8i64_to_8f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq 24(%rdi), %rax
-; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    movq 24(%rdi), %rcx
+; SSE2-NEXT:    testq %rcx, %rcx
 ; SSE2-NEXT:    js .LBB87_1
 ; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    jmp .LBB87_3
-; SSE2-NEXT:  .LBB87_1:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    addss %xmm2, %xmm2
-; SSE2-NEXT:  .LBB87_3:
-; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
 ; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB87_4
-; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    jmp .LBB87_6
+; SSE2-NEXT:    jns .LBB87_5
 ; SSE2-NEXT:  .LBB87_4:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
@@ -4475,6 +4463,18 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; SSE2-NEXT:    orq %rcx, %rax
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:    jmp .LBB87_6
+; SSE2-NEXT:  .LBB87_1:
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    shrq %rdx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
+; SSE2-NEXT:    addss %xmm2, %xmm2
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB87_4
+; SSE2-NEXT:  .LBB87_5:
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:  .LBB87_6:
 ; SSE2-NEXT:    movq (%rdi), %rax
 ; SSE2-NEXT:    movq 8(%rdi), %rcx
@@ -4504,26 +4504,14 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; SSE2-NEXT:  .LBB87_11:
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:  .LBB87_12:
-; SSE2-NEXT:    movq 56(%rdi), %rax
-; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    movq 48(%rdi), %rax
+; SSE2-NEXT:    movq 56(%rdi), %rcx
+; SSE2-NEXT:    testq %rcx, %rcx
 ; SSE2-NEXT:    js .LBB87_13
 ; SSE2-NEXT:  # %bb.14:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm5
-; SSE2-NEXT:    jmp .LBB87_15
-; SSE2-NEXT:  .LBB87_13:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm5
-; SSE2-NEXT:    addss %xmm5, %xmm5
-; SSE2-NEXT:  .LBB87_15:
-; SSE2-NEXT:    movq 48(%rdi), %rax
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm5
 ; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB87_16
-; SSE2-NEXT:  # %bb.17:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
-; SSE2-NEXT:    jmp .LBB87_18
+; SSE2-NEXT:    jns .LBB87_17
 ; SSE2-NEXT:  .LBB87_16:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
@@ -4531,28 +4519,40 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; SSE2-NEXT:    orq %rcx, %rax
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
 ; SSE2-NEXT:    addss %xmm4, %xmm4
+; SSE2-NEXT:    jmp .LBB87_18
+; SSE2-NEXT:  .LBB87_13:
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    shrq %rdx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm5
+; SSE2-NEXT:    addss %xmm5, %xmm5
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB87_16
+; SSE2-NEXT:  .LBB87_17:
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
 ; SSE2-NEXT:  .LBB87_18:
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT:    movq 40(%rdi), %rax
-; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    movq 32(%rdi), %rax
+; SSE2-NEXT:    movq 40(%rdi), %rcx
+; SSE2-NEXT:    testq %rcx, %rcx
 ; SSE2-NEXT:    js .LBB87_19
 ; SSE2-NEXT:  # %bb.20:
 ; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
 ; SSE2-NEXT:    jmp .LBB87_21
 ; SSE2-NEXT:  .LBB87_19:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    shrq %rdx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    orq %rdx, %rcx
 ; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
 ; SSE2-NEXT:    addss %xmm2, %xmm2
 ; SSE2-NEXT:  .LBB87_21:
 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE2-NEXT:    movq 32(%rdi), %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_22
 ; SSE2-NEXT:  # %bb.23:
diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll
index 2ab00ea96ada1f..5f8da88eb354d5 100644
--- a/llvm/test/CodeGen/X86/vec_shift5.ll
+++ b/llvm/test/CodeGen/X86/vec_shift5.ll
@@ -215,7 +215,7 @@ define <4 x i32> @test18(<4 x i32> %a0, ptr %dummy) {
 define <4 x i32> @extelt0_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y){
 ; CHECK-LABEL: extelt0_sub_pslli_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
+; CHECK-NEXT:    movd {{.*#+}} xmm2 = [32,0,0,0]
 ; CHECK-NEXT:    psubd %xmm1, %xmm2
 ; CHECK-NEXT:    pxor %xmm1, %xmm1
 ; CHECK-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
@@ -228,23 +228,15 @@ define <4 x i32> @extelt0_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y){
 }
 
 define <4 x i32> @extelt1_add_psrli_v4i32(<4 x i32> %x, <4 x i32> %y){
-; X86-LABEL: extelt1_add_psrli_v4i32:
-; X86:       # %bb.0:
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X86-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT:    xorps %xmm2, %xmm2
-; X86-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X86-NEXT:    psrld %xmm2, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: extelt1_add_psrli_v4i32:
-; X64:       # %bb.0:
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-NEXT:    xorps %xmm2, %xmm2
-; X64-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X64-NEXT:    psrld %xmm2, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: extelt1_add_psrli_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; CHECK-NEXT:    movd {{.*#+}} xmm2 = [3,0,0,0]
+; CHECK-NEXT:    paddd %xmm1, %xmm2
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; CHECK-NEXT:    psrld %xmm1, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %ext = extractelement <4 x i32> %y, i64 1
   %bo = add i32 %ext, 3
   %r = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 %bo)
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 5dcf19013f0b7c..45c3b73a9948c8 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -54,8 +54,9 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ;
 ; XOP-LABEL: test_bitreverse_i8:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovd %edi, %xmm0
-; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vmovd %edi, %xmm1
+; XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    vmovd %xmm0, %eax
 ; XOP-NEXT:    # kill: def $al killed $al killed $eax
 ; XOP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
index 9c80720ae921a8..818405349b7161 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
@@ -1,30 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -O3 -mtriple=x86_64-pc-linux -stop-after=finalize-isel < %s | FileCheck %s
 
 define <1 x float> @constrained_vector_fadd_v1f32() #0 {
-; CHECK-LABEL: name: constrained_vector_fadd_v1f32
-; CHECK: [[MOVSSrm_alt:%[0-9]+]]:fr32 = MOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
-; CHECK: [[ADDSSrm:%[0-9]+]]:fr32 = ADDSSrm [[MOVSSrm_alt]], $rip, 1, $noreg, %const.1, $noreg, implicit $mxcsr :: (load (s32) from constant-pool)
-; CHECK: $xmm0 = COPY [[ADDSSrm]]
-; CHECK: RET 0, $xmm0
 entry:
   %add = call <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float> <float 0x7FF0000000000000>, <1 x float> <float 1.0>, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret <1 x float> %add
 }
 
 define <3 x float> @constrained_vector_fadd_v3f32() #0 {
-; CHECK-LABEL: name: constrained_vector_fadd_v3f32
-; CHECK: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS
-; CHECK: [[MOVSSrm_alt:%[0-9]+]]:fr32 = MOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
-; CHECK: [[ADDSSrr:%[0-9]+]]:fr32 = ADDSSrr [[MOVSSrm_alt]], killed [[FsFLD0SS]], implicit $mxcsr
-; CHECK: [[ADDSSrm:%[0-9]+]]:fr32 = ADDSSrm [[MOVSSrm_alt]], $rip, 1, $noreg, %const.1, $noreg, implicit $mxcsr :: (load (s32) from constant-pool)
-; CHECK: [[ADDSSrm1:%[0-9]+]]:fr32 = ADDSSrm [[MOVSSrm_alt]], $rip, 1, $noreg, %const.2, $noreg, implicit $mxcsr :: (load (s32) from constant-pool)
-; CHECK: [[COPY:%[0-9]+]]:vr128 = COPY [[ADDSSrm1]]
-; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY [[ADDSSrm]]
-; CHECK: [[UNPCKLPSrr:%[0-9]+]]:vr128 = UNPCKLPSrr [[COPY1]], killed [[COPY]]
-; CHECK: [[COPY2:%[0-9]+]]:vr128 = COPY [[ADDSSrr]]
-; CHECK: [[UNPCKLPDrr:%[0-9]+]]:vr128 = UNPCKLPDrr [[UNPCKLPSrr]], killed [[COPY2]]
-; CHECK: $xmm0 = COPY [[UNPCKLPDrr]]
-; CHECK: RET 0, $xmm0
 entry:
   %add = call <3 x float> @llvm.experimental.constrained.fadd.v3f32(
            <3 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000,
@@ -36,13 +19,6 @@ entry:
 }
 
 define <4 x double> @constrained_vector_fadd_v4f64() #0 {
-; CHECK-LABEL: name: constrained_vector_fadd_v4f64
-; CHECK: [[MOVAPDrm:%[0-9]+]]:vr128 = MOVAPDrm $rip, 1, $noreg, %const.0, $noreg :: (load (s128) from constant-pool)
-; CHECK: [[ADDPDrm:%[0-9]+]]:vr128 = ADDPDrm [[MOVAPDrm]], $rip, 1, $noreg, %const.1, $noreg, implicit $mxcsr :: (load (s128) from constant-pool)
-; CHECK: [[ADDPDrm1:%[0-9]+]]:vr128 = ADDPDrm [[MOVAPDrm]], $rip, 1, $noreg, %const.2, $noreg, implicit $mxcsr :: (load (s128) from constant-pool)
-; CHECK: $xmm0 = COPY [[ADDPDrm1]]
-; CHECK: $xmm1 = COPY [[ADDPDrm]]
-; CHECK: RET 0, $xmm0, $xmm1
 entry:
   %add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64(
            <4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -59,3 +35,5 @@ attributes #0 = { strictfp }
 declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 49062eaef31887..f25267f3dbd53c 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -6319,31 +6319,28 @@ define <3 x double> @constrained_vector_round_v3f64_var(ptr %a) #0 {
 ;
 ; AVX-LABEL: constrained_vector_round_v3f64_var:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    pushq %rbx
-; AVX-NEXT:    .cfi_def_cfa_offset 16
-; AVX-NEXT:    subq $48, %rsp
-; AVX-NEXT:    .cfi_def_cfa_offset 64
-; AVX-NEXT:    .cfi_offset %rbx, -16
-; AVX-NEXT:    movq %rdi, %rbx
+; AVX-NEXT:    subq $72, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 80
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    callq round@PLT
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; AVX-NEXT:    # xmm0 = mem[0],zero
 ; AVX-NEXT:    callq round@PLT
-; AVX-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    callq round@PLT
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT:    addq $48, %rsp
-; AVX-NEXT:    .cfi_def_cfa_offset 16
-; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    addq $72, %rsp
 ; AVX-NEXT:    .cfi_def_cfa_offset 8
 ; AVX-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 193e570c5f9a87..a41f0366882152 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -999,17 +999,19 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX1-LABEL: splatvar_funnnel_v16i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
-; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm5
-; AVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -1087,17 +1089,19 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
-; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; XOPAVX1-NEXT:    vpsrlw $1, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpor %xmm5, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsrlw %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 1d807fa85ddc5c..d1a77c0b543ab0 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -544,6 +544,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
 ; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
@@ -562,6 +564,8 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
 ; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
@@ -622,6 +626,8 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
+; AVX512F-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX512F-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsllw %xmm2, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
@@ -641,24 +647,25 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 9c259ed38321d0..58fb5ebc1d6128 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1034,6 +1034,8 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
@@ -1123,6 +1125,8 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; XOPAVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; XOPAVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOPAVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 665223167fbb4d..9c4c1ff7893960 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -546,6 +546,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
 ; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
@@ -564,6 +566,8 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
 ; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
@@ -626,6 +630,8 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
 ; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
+; AVX512F-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX512F-NEXT:    vpand %xmm6, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
 ; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -646,25 +652,26 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm5, %ymm5
 ; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
 ; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpand %ymm6, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
index eafee9e65345f3..07d15b834452a8 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -216,25 +216,15 @@ define double @test_v2f64(<2 x double> %a0) {
 }
 
 define double @test_v3f64(<3 x double> %a0) {
-; SSE2-LABEL: test_v3f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],mem[1]
-; SSE2-NEXT:    maxpd %xmm2, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    maxsd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v3f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],mem[1]
-; SSE41-NEXT:    maxpd %xmm2, %xmm0
-; SSE41-NEXT:    movapd %xmm0, %xmm1
-; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT:    maxsd %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v3f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE-NEXT:    maxpd %xmm2, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    maxsd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v3f64:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
index 5ae9e552d0dcda..c21959e2fbabed 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
@@ -635,7 +635,7 @@ define double @test_v3f64(<3 x double> %a0) {
 ; SSE2-LABEL: test_v3f64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],mem[1]
+; SSE2-NEXT:    movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
 ; SSE2-NEXT:    movapd %xmm2, %xmm1
 ; SSE2-NEXT:    minpd %xmm0, %xmm1
 ; SSE2-NEXT:    cmpunordpd %xmm0, %xmm0
@@ -656,7 +656,7 @@ define double @test_v3f64(<3 x double> %a0) {
 ; SSE41-LABEL: test_v3f64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],mem[1]
+; SSE41-NEXT:    movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
 ; SSE41-NEXT:    movapd %xmm2, %xmm1
 ; SSE41-NEXT:    minpd %xmm0, %xmm1
 ; SSE41-NEXT:    cmpunordpd %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index b114cba14cb6c7..fb5df6933ccf4f 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -660,29 +660,19 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 
 define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
-; SSE2-LABEL: splatvar_rotate_v2i64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
-; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    psllq %xmm1, %xmm3
-; SSE2-NEXT:    psrlq %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: splatvar_rotate_v2i64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm2 = [64,64]
-; SSE41-NEXT:    psubq %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    psllq %xmm1, %xmm3
-; SSE41-NEXT:    psrlq %xmm2, %xmm0
-; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: splatvar_rotate_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm2 = [64,0]
+; SSE-NEXT:    psubq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psllq %xmm1, %xmm3
+; SSE-NEXT:    psrlq %xmm2, %xmm0
+; SSE-NEXT:    por %xmm3, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splatvar_rotate_v2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [64,64]
+; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [64,0]
 ; AVX-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
 ; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
 ; AVX-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 86c4d79a28c891..49ad50a9945153 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -517,7 +517,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-LABEL: splatvar_rotate_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = [64,0]
 ; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpsllq %xmm1, %xmm3, %xmm4
@@ -532,7 +532,7 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX2-LABEL: splatvar_rotate_v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [64,64]
+; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = [64,0]
 ; AVX2-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 54056461bff8ce..4373620d130ebf 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -936,17 +936,19 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE-LABEL: splatvar_modulo_shift_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE-NEXT:    psrlq %xmm1, %xmm2
-; SSE-NEXT:    psrlq %xmm1, %xmm0
-; SSE-NEXT:    pxor %xmm2, %xmm0
-; SSE-NEXT:    psubq %xmm2, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm2 = [63,0]
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; SSE-NEXT:    psrlq %xmm2, %xmm1
+; SSE-NEXT:    psrlq %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psubq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
@@ -957,7 +959,8 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
 ;
 ; AVX2-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
@@ -967,7 +970,8 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
@@ -976,8 +980,9 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v2i64:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
@@ -986,7 +991,8 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
 ; AVX512-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512-NEXT:    vzeroupper
@@ -1160,17 +1166,19 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
@@ -1178,9 +1186,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQ-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1188,9 +1197,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
 ; AVX512BW-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1199,9 +1209,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -1209,9 +1220,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
 ; AVX512BWVL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index abd81a0e9f99a0..5afb48bea3f503 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -1010,7 +1010,8 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
@@ -1026,7 +1027,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; AVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
@@ -1036,7 +1038,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
@@ -1048,7 +1051,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
 ; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
 ; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
@@ -1059,7 +1063,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ; AVX512-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512-NEXT:    retq
@@ -1265,9 +1270,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vpshab %xmm1, %xmm2, %xmm2
@@ -1277,8 +1283,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
@@ -1304,9 +1311,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
 ; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -1327,9 +1335,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
 ; AVX512BWVL-NEXT:    vpsraw %xmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BWVL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 74dbee5e5d2ca7..67ff078014e024 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -233,7 +233,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; ALL-LABEL: splatvar_modulo_shift_v8i64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; ALL-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; ALL-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 467c1574180da1..1efbf0d0f0ca4c 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -771,25 +771,29 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE-LABEL: splatvar_modulo_shift_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    psrlq %xmm1, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm2 = [63,0]
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    psrlq %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; XOP-LABEL: splatvar_modulo_shift_v2i64:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOP-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
@@ -941,17 +945,19 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
@@ -959,9 +965,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQ-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -969,9 +976,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -980,9 +988,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQVL-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -990,9 +999,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BWVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index ca303b4c7ebf67..64db9c6d33f9bb 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -823,8 +823,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -832,14 +833,16 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; AVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -847,13 +850,15 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
@@ -1037,9 +1042,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
@@ -1049,8 +1055,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
@@ -1072,9 +1079,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -1092,9 +1100,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BWVL-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BWVL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 103d5702fb93a1..6640dfb13f4d1b 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -188,7 +188,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; ALL-LABEL: splatvar_modulo_shift_v8i64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; ALL-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; ALL-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 4dda9ff09cc62d..8408179ebee076 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -678,25 +678,29 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE-LABEL: splatvar_modulo_shift_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    psllq %xmm1, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm2 = [63,0]
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    psllq %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; XOP-LABEL: splatvar_modulo_shift_v2i64:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOP-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
@@ -848,24 +852,27 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQ-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT:    vpslld %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -873,9 +880,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -884,9 +892,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQVL-NEXT:    vpslld %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -894,9 +903,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BWVL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index c80f24ad577730..7f0e3388944e03 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -748,8 +748,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vpsllq %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -757,14 +758,16 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; AVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsllq %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -772,13 +775,15 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 ;
@@ -962,9 +967,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
@@ -973,9 +979,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -993,9 +1000,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    retq
@@ -1012,9 +1020,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BWVL-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BWVL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index a42056be895e7a..540bab0cdc33a6 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -181,7 +181,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; ALL-LABEL: splatvar_modulo_shift_v8i64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; ALL-NEXT:    vmovq {{.*#+}} xmm2 = [63,0]
+; ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; ALL-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 1d389f98172294..fdab5b797f3faa 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1222,7 +1222,7 @@ define <2 x double> @insert_dup_mem_v2f64(ptr %ptr) {
 define <2 x double> @insert_dup_mem128_v2f64(ptr %ptr) nounwind {
 ; SSE2-LABEL: insert_dup_mem128_v2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps (%rdi), %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE2-NEXT:    retq
 ;
@@ -1308,7 +1308,7 @@ define <2 x double> @shuffle_mem_v2f64_02(<2 x double> %a, ptr %pb) {
 ;
 ; AVX-LABEL: shuffle_mem_v2f64_02:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; AVX-NEXT:    retq
   %b = load <2 x double>, ptr %pb, align 1
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
@@ -1316,30 +1316,14 @@ define <2 x double> @shuffle_mem_v2f64_02(<2 x double> %a, ptr %pb) {
 }
 
 define <2 x double> @shuffle_mem_v2f64_21(<2 x double> %a, ptr %pb) {
-; SSE2-LABEL: shuffle_mem_v2f64_21:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_mem_v2f64_21:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_mem_v2f64_21:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_mem_v2f64_21:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movups (%rdi), %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_mem_v2f64_21:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_mem_v2f64_21:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; AVX-NEXT:    retq
   %b = load <2 x double>, ptr %pb, align 1
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index a79b109feec72b..428dffcdda576a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2463,7 +2463,7 @@ define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, ptr %pb) {
 ;
 ; AVX-LABEL: shuffle_mem_v4f32_0145:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; AVX-NEXT:    retq
   %b = load <4 x float>, ptr %pb, align 1
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -2471,30 +2471,14 @@ define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, ptr %pb) {
 }
 
 define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, ptr %pb) {
-; SSE2-LABEL: shuffle_mem_v4f32_4523:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_mem_v4f32_4523:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_mem_v4f32_4523:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_mem_v4f32_4523:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movups (%rdi), %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_mem_v4f32_4523:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_mem_v4f32_4523:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; AVX-NEXT:    retq
   %b = load <4 x float>, ptr %pb, align 1
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
@@ -2525,23 +2509,46 @@ define  <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, ptr %a1) {
 }
 
 define  <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, ptr %a1) {
-; SSE-LABEL: shuffle_mem_v4f32_4760:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_mem_v4f32_4760:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_mem_v4f32_4760:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_mem_v4f32_4760:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_mem_v4f32_4760:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
+; SSE41-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: shuffle_mem_v4f32_4760:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0]
-; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
+; AVX1OR2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_mem_v4f32_4760:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,3,2,4]
-; AVX512VL-NEXT:    vpermt2ps (%rdi), %xmm1, %xmm0
+; AVX512VL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,3,2,4]
+; AVX512VL-NEXT:    vpermt2ps %xmm1, %xmm2, %xmm0
 ; AVX512VL-NEXT:    retq
   %1 = load <4 x float>, ptr %a1
   %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 4, i32 7, i32 6, i32 0>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index ec54b755135829..feefa8fb875e54 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -914,16 +914,15 @@ define void @PR63030(ptr %p0) {
 ;
 ; X64-AVX2-LABEL: PR63030:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; X64-AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [3,3]
-; X64-AVX2-NEXT:    # xmm1 = mem[0,0]
-; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[1,1,0,0]
-; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; X64-AVX2-NEXT:    vmovaps {{.*#+}} xmm2 = [3,2]
-; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
-; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
-; X64-AVX2-NEXT:    vmovaps %ymm0, (%rax)
-; X64-AVX2-NEXT:    vmovaps %ymm1, (%rax)
+; X64-AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [3,3]
+; X64-AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[1,1,0,0]
+; X64-AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; X64-AVX2-NEXT:    vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; X64-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; X64-AVX2-NEXT:    vmovdqa %ymm0, (%rax)
+; X64-AVX2-NEXT:    vmovdqa %ymm1, (%rax)
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 5c035346415b0e..514523efef2a9a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -796,21 +796,26 @@ define <16 x i8> @constant_fold_pshufb_2() {
 define i32 @mask_zzz3_v16i8(<16 x i8> %a0) {
 ; SSSE3-LABEL: mask_zzz3_v16i8:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    movzbl {{\.?LCPI[0-9]+_[0-9]+}}+3(%rip), %eax
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    pslld $24, %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
 ; SSSE3-NEXT:    movd %xmm0, %eax
 ; SSSE3-NEXT:    andl $-16777216, %eax # imm = 0xFF000000
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: mask_zzz3_v16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    pinsrb $15, {{\.?LCPI[0-9]+_[0-9]+}}+15(%rip), %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
 ; SSE41-NEXT:    pextrd $3, %xmm0, %eax
 ; SSE41-NEXT:    andl $-16777216, %eax # imm = 0xFF000000
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: mask_zzz3_v16i8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $15, {{\.?LCPI[0-9]+_[0-9]+}}+15(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpextrd $3, %xmm0, %eax
 ; AVX-NEXT:    andl $-16777216, %eax # imm = 0xFF000000
 ; AVX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 07c770abc65d6c..a7210cffc80c08 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1749,13 +1749,21 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_test1c:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_test1c:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test1c:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %A = load <4 x i8>, ptr %a
   %B = load <4 x i8>, ptr %b
   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1810,17 +1818,21 @@ define <4 x i8> @combine_test3c(ptr %a, ptr %b) {
 define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
 ; SSE2-LABEL: combine_test4c:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    andps %xmm0, %xmm2
-; SSE2-NEXT:    andnps %xmm1, %xmm0
-; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    movzbl 1(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    psllw $8, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test4c:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movzbl 1(%rdi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    psllw $8, %xmm1
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -1828,20 +1840,32 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
 ;
 ; SSE41-LABEL: combine_test4c:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT:    movzbl 1(%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm1
+; SSE41-NEXT:    psllw $8, %xmm1
 ; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE41-NEXT:    movss {{.*#+}} xmm0 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_test4c:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_test4c:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzbl 1(%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test4c:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb 1(%rdi), %xmm0
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %A = load <4 x i8>, ptr %a
   %B = load <4 x i8>, ptr %b
   %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -2821,16 +2845,16 @@ define <4 x float> @PR30264(<4 x float> %x) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
-; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: PR30264:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
-; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: PR30264:
@@ -3051,17 +3075,18 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) {
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    psraw $8, %xmm1
 ; SSE2-NEXT:    pextrw $7, %xmm1, %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    movsbl (%rsi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    movsbl (%rdx), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = [65531,0,0,0]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movsbl (%rsi), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movsbl (%rdx), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_scalar_to_vector_extract:
@@ -3077,7 +3102,8 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) {
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; SSSE3-NEXT:    pxor %xmm0, %xmm0
 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = [65531,0,0,0]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-NEXT:    retq
 ;
@@ -3555,14 +3581,15 @@ define void @SpinningCube() {
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
 ; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u]
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE2-NEXT:    xorps %xmm3, %xmm3
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
-; SSE2-NEXT:    addps %xmm3, %xmm1
-; SSE2-NEXT:    movaps %xmm1, (%rax)
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
+; SSE2-NEXT:    addps %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, (%rax)
 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE2-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -3574,14 +3601,15 @@ define void @SpinningCube() {
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
 ; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSSE3-NEXT:    movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u]
-; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSSE3-NEXT:    xorps %xmm3, %xmm3
-; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
-; SSSE3-NEXT:    addps %xmm3, %xmm1
-; SSSE3-NEXT:    movaps %xmm1, (%rax)
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSSE3-NEXT:    movaps %xmm2, %xmm3
+; SSSE3-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
+; SSSE3-NEXT:    addps %xmm1, %xmm2
+; SSSE3-NEXT:    movaps %xmm2, (%rax)
 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
 ; SSSE3-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 2b89590a0bb419..ecdea22d7b5a4d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -42,7 +42,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0]
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = [18446744073709551615,0]
 ; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
@@ -56,7 +56,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [18446744073709551615,0]
+; AVX512VL-NEXT:    vmovq {{.*#+}} xmm2 = [18446744073709551615,0]
 ; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
@@ -67,7 +67,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
-; VL_BW_DQ-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0]
+; VL_BW_DQ-NEXT:    vmovq {{.*#+}} xmm1 = [18446744073709551615,0]
 ; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
index e83c1e84827737..fea59d96576127 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
@@ -69,7 +69,7 @@ define <64 x i8> @f1(ptr %p0) {
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm0 ^ (mem & (ymm2 ^ ymm0))
 ; AVX512F-NEXT:    vmovdqa 80(%rdi), %xmm0
 ; AVX512F-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa 64(%rdi), %xmm4
@@ -83,7 +83,7 @@ define <64 x i8> @f1(ptr %p0) {
 ; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,5,7,11,13,17,19,23,25,29,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm1 & mem)
 ; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -214,7 +214,7 @@ define <64 x i8> @f2(ptr %p0) {
 ; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm4
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem)
 ; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm0
 ; AVX512F-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm4
@@ -228,7 +228,7 @@ define <64 x i8> @f2(ptr %p0) {
 ; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: f2:
@@ -344,7 +344,7 @@ define <64 x i8> @f3(ptr %p0) {
 ; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpmovsxwd {{.*#+}} ymm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0]
-; AVX512F-NEXT:    vpternlogq $216, %ymm5, %ymm2, %ymm0
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm5 & (ymm0 ^ ymm2))
 ; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm6
 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
@@ -369,7 +369,7 @@ define <64 x i8> @f3(ptr %p0) {
 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vpternlogq $226, %ymm1, %ymm5, %ymm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm5 & (ymm2 ^ ymm1))
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -497,7 +497,7 @@ define <64 x i8> @f4(ptr %p0) {
 ; AVX512F-NEXT:    vmovdqa 128(%rdi), %ymm4
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem)
 ; AVX512F-NEXT:    vmovdqa 96(%rdi), %xmm0
 ; AVX512F-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa 112(%rdi), %xmm4
@@ -511,7 +511,7 @@ define <64 x i8> @f4(ptr %p0) {
 ; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-NEXT:    vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: f4:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
index 0efbe018764d28..ec81ecf6faa857 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
@@ -35,7 +35,7 @@ define <32 x i8> @foo(ptr %x0) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqu 32(%rdi), %xmm0
 ; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX2-NEXT:    vmovdqu 16(%rdi), %xmm2
+; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
@@ -53,7 +53,7 @@ define <32 x i8> @foo(ptr %x0) {
 ; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqu 16(%rdi), %xmm2
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,2,3,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero,ymm0[24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
@@ -65,7 +65,7 @@ define <32 x i8> @foo(ptr %x0) {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqu 32(%rdi), %xmm0
 ; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX512BW-NEXT:    vmovdqu 16(%rdi), %xmm2
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    movl $63488, %eax # imm = 0xF800
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index be6ee8f6899584..494d216dfa0ed4 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -761,19 +761,31 @@ define void @vselect_allzeros_LHS_multiple_use_setcc(<4 x i32> %x, <4 x i32> %y,
 ; This test case previously crashed after r363802, r363850, and r363856 due
 ; any_extend_vector_inreg not being handled by the X86 backend.
 define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) {
-; SSE-LABEL: vselect_any_extend_vector_inreg_crash:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    andl $1, %eax
-; SSE-NEXT:    shll $15, %eax
-; SSE-NEXT:    retq
+; SSE2-LABEL: vselect_any_extend_vector_inreg_crash:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = [49,0,0,0]
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    shll $15, %eax
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: vselect_any_extend_vector_inreg_crash:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT:    pinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    andl $1, %eax
+; SSE41-NEXT:    shll $15, %eax
+; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: vselect_any_extend_vector_inreg_crash:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    andl $1, %eax
 ; AVX1-NEXT:    shll $15, %eax
diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll
index 8e47ed67bdcffc..3c8d358413eca8 100644
--- a/llvm/test/CodeGen/X86/widened-broadcast.ll
+++ b/llvm/test/CodeGen/X86/widened-broadcast.ll
@@ -140,23 +140,14 @@ entry:
 define <8 x i16> @load_splat_8i16_8i16_01010101(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_8i16_8i16_01010101:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_8i16_8i16_01010101:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_8i16_8i16_01010101:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_8i16_8i16_01010101:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_8i16_8i16_01010101:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <8 x i16>, ptr %ptr
   %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -192,7 +183,8 @@ entry:
 define <16 x i16> @load_splat_16i16_8i16_0101010101010101(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_16i16_8i16_0101010101010101:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
@@ -226,7 +218,8 @@ entry:
 define <16 x i16> @load_splat_16i16_16i16_0101010101010101(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_16i16_16i16_0101010101010101:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
@@ -288,23 +281,14 @@ entry:
 define <16 x i8> @load_splat_16i8_16i8_0123012301230123(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_16i8_16i8_0123012301230123:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_16i8_16i8_0123012301230123:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_16i8_16i8_0123012301230123:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_16i8_16i8_0123012301230123:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_16i8_16i8_0123012301230123:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <16 x i8>, ptr %ptr
   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -347,7 +331,8 @@ define <32 x i8> @load_splat_32i8_16i8_01010101010101010101010101010101(ptr %ptr
 ;
 ; AVX1-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -370,7 +355,8 @@ entry:
 define <32 x i8> @load_splat_32i8_16i8_01230123012301230123012301230123(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
@@ -411,7 +397,8 @@ define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(ptr %ptr
 ;
 ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -434,7 +421,8 @@ entry:
 define <32 x i8> @load_splat_32i8_32i8_01230123012301230123012301230123(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
@@ -468,7 +456,7 @@ entry:
 define <4 x float> @load_splat_4f32_8f32_0000(ptr %ptr) nounwind uwtable readnone ssp {
 ; SSE-LABEL: load_splat_4f32_8f32_0000:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 3e76bffb77a665..91831d2326bbbd 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -53,10 +53,12 @@ define <4 x double> @load_factorf64_2(ptr %ptr) nounwind {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovupd (%rdi), %ymm0
 ; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovupd 64(%rdi), %ymm2
-; AVX1-NEXT:    vmovupd 96(%rdi), %ymm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX1-NEXT:    vbroadcastsd 88(%rdi), %ymm2
+; AVX1-NEXT:    vbroadcastsd 120(%rdi), %ymm3
+; AVX1-NEXT:    vmovsd 64(%rdi), %xmm4 # xmm4 = mem[0],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX1-NEXT:    vmovsd 96(%rdi), %xmm5 # xmm5 = mem[0],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm5
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
@@ -88,21 +90,21 @@ define <4 x double> @load_factorf64_2(ptr %ptr) nounwind {
 define <4 x double> @load_factorf64_1(ptr %ptr) nounwind {
 ; AVX1-LABEL: load_factorf64_1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovups (%rdi), %ymm0
-; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX1-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT:    vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; AVX1-NEXT:    vmovsd 64(%rdi), %xmm1 # xmm1 = mem[0],zero
+; AVX1-NEXT:    vmovhps 96(%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-NEXT:    vmovhps 32(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2OR512-LABEL: load_factorf64_1:
 ; AVX2OR512:       # %bb.0:
-; AVX2OR512-NEXT:    vmovupd (%rdi), %ymm0
-; AVX2OR512-NEXT:    vmovupd 32(%rdi), %ymm1
-; AVX2OR512-NEXT:    vperm2f128 $32, 64(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0,1],mem[0,1]
-; AVX2OR512-NEXT:    vperm2f128 $32, 96(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1],mem[0,1]
-; AVX2OR512-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2OR512-NEXT:    vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; AVX2OR512-NEXT:    vmovsd 64(%rdi), %xmm1 # xmm1 = mem[0],zero
+; AVX2OR512-NEXT:    vmovhpd 96(%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
+; AVX2OR512-NEXT:    vmovhpd 32(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
+; AVX2OR512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2OR512-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
 ; AVX2OR512-NEXT:    retq
   %wide.vec = load <16 x double>, ptr %ptr, align 16
@@ -1873,8 +1875,9 @@ define void @splat4_v4i64_load_store(ptr %s, ptr %d) nounwind {
 define <2 x i64> @PR37616(ptr %a0) nounwind {
 ; AVX-LABEL: PR37616:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
-; AVX-NEXT:    vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    vmovsd 48(%rdi), %xmm0 # xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd 16(%rdi), %xmm1 # xmm1 = mem[0],zero
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX-NEXT:    retq
   %load = load <16 x i64>, ptr %a0, align 128
   %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
diff --git a/llvm/test/CodeGen/X86/xop-shifts.ll b/llvm/test/CodeGen/X86/xop-shifts.ll
index 83dcf9ce0d1e90..1512a488846a8a 100644
--- a/llvm/test/CodeGen/X86/xop-shifts.ll
+++ b/llvm/test/CodeGen/X86/xop-shifts.ll
@@ -8,9 +8,12 @@
 define <16 x i8> @demandedelts_vpshab(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: demandedelts_vpshab:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vpshab %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %shift = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %shuffle, <16 x i8> %a1)
diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll
index 2bef66825d8c02..f5c879b2011cd1 100644
--- a/llvm/test/CodeGen/X86/xor.ll
+++ b/llvm/test/CodeGen/X86/xor.ll
@@ -405,8 +405,11 @@ define i32 @PR17487(i1 %tobool) {
 ; X64-LIN:       # %bb.0:
 ; X64-LIN-NEXT:    movd %edi, %xmm0
 ; X64-LIN-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; X64-LIN-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-LIN-NEXT:    pextrw $4, %xmm0, %eax
+; X64-LIN-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-LIN-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X64-LIN-NEXT:    pand %xmm0, %xmm1
+; X64-LIN-NEXT:    pextrw $4, %xmm1, %eax
+; X64-LIN-NEXT:    movzbl %al, %eax
 ; X64-LIN-NEXT:    retq
 ;
 ; X64-WIN-LABEL: PR17487:
@@ -414,8 +417,11 @@ define i32 @PR17487(i1 %tobool) {
 ; X64-WIN-NEXT:    movzbl %cl, %eax
 ; X64-WIN-NEXT:    movd %eax, %xmm0
 ; X64-WIN-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; X64-WIN-NEXT:    pand __xmm at 00000000000000010000000000000001(%rip), %xmm0
-; X64-WIN-NEXT:    pextrw $4, %xmm0, %eax
+; X64-WIN-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-WIN-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X64-WIN-NEXT:    pand %xmm0, %xmm1
+; X64-WIN-NEXT:    pextrw $4, %xmm1, %eax
+; X64-WIN-NEXT:    movzbl %al, %eax
 ; X64-WIN-NEXT:    retq
   %tmp = insertelement <2 x i1> undef, i1 %tobool, i32 1
   %tmp1 = zext <2 x i1> %tmp to <2 x i64>


