[llvm] [DAGCombiner][X86] Push bitcast/ext through freeze for loads (PR #163070)

Guy David via llvm-commits llvm-commits at lists.llvm.org
Sun Oct 26 09:40:21 PDT 2025


https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/163070

From 9373e0b0bb04ecdb9d661f26fb8b14597fc87842 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Tue, 14 Oct 2025 21:10:44 +0300
Subject: [PATCH 1/2] [DAGCombiner][X86] Push bitcast/ext through freeze for
 loads

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  17 +++
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  14 +++
 .../AArch64/freeze-bitcast-ext-load.ll        | 119 ++++++++++++++++++
 .../test/CodeGen/X86/avx10_2_512bf16-arith.ll |   2 +-
 llvm/test/CodeGen/X86/avx10_2bf16-arith.ll    |   4 +-
 llvm/test/CodeGen/X86/avx512-ext.ll           |  32 ++---
 ...ad-of-small-alloca-with-zero-upper-half.ll |  12 +-
 7 files changed, 171 insertions(+), 29 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6bf9008c3d677..06a8c832fe4a2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16944,6 +16944,23 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
     return SDValue();
 
+  // fold: bitcast(freeze(load)) -> freeze(bitcast(load))
+  // fold: sext(freeze(load)) -> freeze(sext(load))
+  // fold: zext(freeze(load)) -> freeze(zext(load))
+  // This allows the conversion to potentially fold into the load.
+  if (N0.getOpcode() == ISD::LOAD && N->hasOneUse()) {
+    SDNode *User = *N->user_begin();
+    unsigned UserOpcode = User->getOpcode();
+    if (UserOpcode == ISD::BITCAST || UserOpcode == ISD::SIGN_EXTEND ||
+        UserOpcode == ISD::ZERO_EXTEND) {
+      SDValue NewConv =
+          DAG.getNode(UserOpcode, SDLoc(User), User->getValueType(0), N0);
+      SDValue FrozenConv = DAG.getFreeze(NewConv);
+      DAG.ReplaceAllUsesWith(User, FrozenConv.getNode());
+      return SDValue(N, 0);
+    }
+  }
+
   // Fold freeze(op(x, ...)) -> op(freeze(x), ...).
   // Try to push freeze through instructions that propagate but don't produce
   // poison as far as possible. If an operand of freeze follows three
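
For illustration, a minimal IR sketch of the pattern this combine targets
(it mirrors the new AArch64 tests added below). Before this change the
freeze sat between the load and the extension and blocked the
extending-load fold:

  define i32 @sext_freeze_load(ptr %p) {
    %v = load i8, ptr %p
    %f = freeze i8 %v
    %e = sext i8 %f to i32  ; goal: fold the sext into the load (ldrsb)
    ret i32 %e
  }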
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b5f8ee50cba3d..5b677f6692ea6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3448,6 +3448,20 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
     return false;
 
+  // With low alignment, don't convert integer vectors to large scalar loads,
+  // because otherwise they get broken into many small scalar loads.
+  if (LoadVT.isVector() && LoadVT.isInteger() && !BitcastVT.isVector() &&
+      BitcastVT.isInteger()) {
+    const DataLayout &DL = DAG.getDataLayout();
+    unsigned MinAlign = DL.getPointerSize();
+    // Aligned well, will legalize into a clean sequence of loads.
+    if (MMO.getAlign() >= MinAlign)
+      return true;
+    // Aligned poorly for a large enough scalar.
+    if (BitcastVT.getSizeInBits() > 2 * DL.getPointerSizeInBits())
+      return false;
+  }
+
   // If both types are legal vectors, it's always ok to convert them.
   if (LoadVT.isVector() && BitcastVT.isVector() &&
       isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
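
As a rough illustration of the heuristic above, assuming an x86-64 target
(pointer size 8 bytes, i.e. 64 bits): an under-aligned integer vector load
bitcast to a scalar wider than two pointers keeps its vector form, because
an unaligned wide scalar load would be legalized into many small loads:

  define i256 @cast_underaligned(ptr %p) {
    %v = load <32 x i8>, ptr %p, align 1 ; 256-bit load, align 1 < 8
    %s = bitcast <32 x i8> %v to i256    ; 256 > 2 * 64: fold rejected
    ret i256 %s
  }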
diff --git a/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll b/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
new file mode 100644
index 0000000000000..8124d35b063a7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+define double @test_bitcast_freeze_load(ptr %p) {
+; CHECK-LABEL: test_bitcast_freeze_load:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
+  %v = load <2 x float>, ptr %p
+  %f = freeze <2 x float> %v
+  %b = bitcast <2 x float> %f to double
+  ret double %b
+}
+
+define i32 @test_sext_freeze_load_i8(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrsb w0, [x0]
+; CHECK-NEXT:    ret
+  %v = load i8, ptr %p
+  %f = freeze i8 %v
+  %e = sext i8 %f to i32
+  ret i32 %e
+}
+
+define i64 @test_sext_freeze_load_i32(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    sxtw x0, w8
+; CHECK-NEXT:    ret
+  %v = load i32, ptr %p
+  %f = freeze i32 %v
+  %e = sext i32 %f to i64
+  ret i64 %e
+}
+
+define i64 @test_sext_freeze_load_i16(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrsh x0, [x0]
+; CHECK-NEXT:    ret
+  %v = load i16, ptr %p
+  %f = freeze i16 %v
+  %e = sext i16 %f to i64
+  ret i64 %e
+}
+
+define i32 @test_zext_freeze_load_i8(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrb w0, [x0]
+; CHECK-NEXT:    ret
+  %v = load i8, ptr %p
+  %f = freeze i8 %v
+  %e = zext i8 %f to i32
+  ret i32 %e
+}
+
+define i64 @test_zext_freeze_load_i32(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w0, [x0]
+; CHECK-NEXT:    ret
+  %v = load i32, ptr %p
+  %f = freeze i32 %v
+  %e = zext i32 %f to i64
+  ret i64 %e
+}
+
+define i64 @test_zext_freeze_load_i16(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrh w0, [x0]
+; CHECK-NEXT:    ret
+  %v = load i16, ptr %p
+  %f = freeze i16 %v
+  %e = zext i16 %f to i64
+  ret i64 %e
+}
+
+define i32 @test_sext_freeze_load_multiuse(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_multiuse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    sxtb w9, w8
+; CHECK-NEXT:    add w0, w9, w8, uxtb
+; CHECK-NEXT:    ret
+  %v = load i8, ptr %p
+  %f = freeze i8 %v
+  %e = sext i8 %f to i32
+  %z = zext i8 %f to i32
+  %r = add i32 %e, %z
+  ret i32 %r
+}
+
+define <4 x i32> @test_sext_freeze_load_v4i16(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ret
+  %v = load <4 x i16>, ptr %p
+  %f = freeze <4 x i16> %v
+  %e = sext <4 x i16> %f to <4 x i32>
+  ret <4 x i32> %e
+}
+
+define <4 x i32> @test_zext_freeze_load_v4i16(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ret
+  %v = load <4 x i16>, ptr %p
+  %f = freeze <4 x i16> %v
+  %e = zext <4 x i16> %f to <4 x i32>
+  ret <4 x i32> %e
+}
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
index 79849a7153c91..d9b4635042256 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
index 0f2c75b15d5b4..01b7618753a23 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 1a712ffac5b7e..03f283a57a217 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -212,11 +212,9 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT:    vmovdqu (%rdi), %ymm2
-; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; KNL-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; KNL-NEXT:    vpsllw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpsllw $15, %ymm1, %ymm1
@@ -237,11 +235,9 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
 ; AVX512DQNOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT:    vmovdqu (%rdi), %ymm2
-; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512DQNOBW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512DQNOBW-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512DQNOBW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512DQNOBW-NEXT:    vpsllw $15, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpsllw $15, %ymm1, %ymm1
@@ -261,11 +257,9 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT:    vmovdqu (%rdi), %ymm2
-; KNL-NEXT:    vpmovsxbw %xmm2, %ymm3
-; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; KNL-NEXT:    vpmovsxbw %xmm2, %ymm2
-; KNL-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; KNL-NEXT:    vpmovsxbw (%rdi), %ymm2
+; KNL-NEXT:    vpmovsxbw 16(%rdi), %ymm3
+; KNL-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; KNL-NEXT:    vpsllw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpsllw $15, %ymm1, %ymm1
@@ -286,11 +280,9 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
 ; AVX512DQNOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT:    vmovdqu (%rdi), %ymm2
-; AVX512DQNOBW-NEXT:    vpmovsxbw %xmm2, %ymm3
-; AVX512DQNOBW-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512DQNOBW-NEXT:    vpmovsxbw %xmm2, %ymm2
-; AVX512DQNOBW-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQNOBW-NEXT:    vpmovsxbw (%rdi), %ymm2
+; AVX512DQNOBW-NEXT:    vpmovsxbw 16(%rdi), %ymm3
+; AVX512DQNOBW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512DQNOBW-NEXT:    vpsllw $15, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpsllw $15, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 81c4d5d71084c..fce622a99bb6a 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -171,8 +171,8 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
@@ -180,8 +180,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    movl (%rdi), %eax
+; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movb %al, (%rdx)
 ; X64-BMI2-NEXT:    retq
@@ -248,8 +248,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
@@ -257,8 +257,8 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    movl (%rdi), %eax
+; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movw %ax, (%rdx)
 ; X64-BMI2-NEXT:    retq
@@ -324,8 +324,8 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
@@ -333,8 +333,8 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    movl (%rdi), %eax
+; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movl %eax, (%rdx)
 ; X64-BMI2-NEXT:    retq

From c396c3dcf7ce3ffff9279cac668f0d7233102ead Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Sun, 26 Oct 2025 17:57:21 +0200
Subject: [PATCH 2/2] Address review comment

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   45 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   18 +-
 .../AArch64/freeze-bitcast-ext-load.ll        |    3 +-
 llvm/test/CodeGen/AArch64/freeze.ll           |   12 +-
 llvm/test/CodeGen/AArch64/pr66603.ll          |    3 +-
 llvm/test/CodeGen/AArch64/vector-compress.ll  |   22 +-
 llvm/test/CodeGen/AArch64/vselect-ext.ll      |   24 +-
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      |    8 +-
 .../atomic_optimizations_local_pointer.ll     |  240 +--
 llvm/test/CodeGen/AMDGPU/div_v2i128.ll        |  592 +++----
 llvm/test/CodeGen/AMDGPU/fnearbyint.ll        |   59 +-
 llvm/test/CodeGen/AMDGPU/frem.ll              |   72 +-
 llvm/test/CodeGen/AMDGPU/llvm.mulo.ll         |  160 +-
 llvm/test/CodeGen/AMDGPU/mul_int24.ll         |   74 +-
 llvm/test/CodeGen/AMDGPU/select-undef.ll      |    3 +-
 .../test/CodeGen/AMDGPU/vector-reduce-smax.ll |   12 +-
 .../test/CodeGen/AMDGPU/vector-reduce-smin.ll |   12 +-
 .../test/CodeGen/AMDGPU/vector-reduce-umax.ll |   12 +-
 .../test/CodeGen/AMDGPU/vector-reduce-umin.ll |   12 +-
 llvm/test/CodeGen/RISCV/pr66603.ll            |    4 -
 llvm/test/CodeGen/SystemZ/pr60413.ll          |   34 +-
 .../test/CodeGen/Thumb2/mve-pred-selectop3.ll |    4 +-
 llvm/test/CodeGen/X86/avx512-ext.ll           |   28 +-
 llvm/test/CodeGen/X86/freeze-binary.ll        |   22 +-
 llvm/test/CodeGen/X86/freeze.ll               |    2 -
 llvm/test/CodeGen/X86/midpoint-int-vec-512.ll |   84 +-
 llvm/test/CodeGen/X86/movmsk-cmp.ll           |    4 +-
 llvm/test/CodeGen/X86/pr162812.ll             |   50 +-
 llvm/test/CodeGen/X86/ushl_sat_vec.ll         |    2 +-
 llvm/test/CodeGen/X86/var-permute-128.ll      |   66 +-
 llvm/test/CodeGen/X86/vector-compress.ll      | 1165 +++++++-------
 ...ad-of-small-alloca-with-zero-upper-half.ll | 1408 ++++++++++-------
 .../CodeGen/X86/widen-load-of-small-alloca.ll |  265 ++--
 33 files changed, 2284 insertions(+), 2237 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 06a8c832fe4a2..ccb4a70e4dc23 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14815,6 +14815,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level))
     return Res;
 
+  if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse() && !VT.isVector()) {
+    SDValue Res =
+        DAG.getFreeze(DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0)));
+    return DAG.getNode(ISD::AssertSext, DL, VT, Res,
+                       DAG.getValueType(N0.getOperand(0).getValueType()));
+  }
+
   return SDValue();
 }
 
@@ -15194,6 +15201,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
       return SDValue(CSENode, 0);
   }
 
+  if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse() && !VT.isVector()) {
+    SDValue Res =
+        DAG.getFreeze(DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)));
+    return DAG.getNode(ISD::AssertZext, DL, VT, Res,
+                       DAG.getValueType(N0.getOperand(0).getValueType()));
+  }
+
   return SDValue();
 }
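
As a sketch, in the same notation as the comments this patch removes from
visitFREEZE below, the two hunks above perform:

  fold: sext(freeze(x)) -> AssertSext(freeze(sext(x)), ty(x))
  fold: zext(freeze(x)) -> AssertZext(freeze(zext(x)), ty(x))

The AssertSext/AssertZext re-attaches the "result was extended from ty(x)"
fact that the freeze would otherwise hide, so downstream known-bits
reasoning is not pessimized (ty(x) here stands for the pre-extension value
type passed via DAG.getValueType).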
 
@@ -15362,6 +15376,10 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
   if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level))
     return Res;
 
+  if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse())
+    return DAG.getFreeze(
+        DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(0)));
+
   return SDValue();
 }
 
@@ -16911,6 +16929,11 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
       return LegalShuffle;
   }
 
+  if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse()) {
+    SDLoc DL(N);
+    return DAG.getFreeze(DAG.getNode(ISD::BITCAST, DL, VT, N0.getOperand(0)));
+  }
+
   return SDValue();
 }
 
@@ -16943,23 +16966,11 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   // example https://reviews.llvm.org/D136529#4120959.
   if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
     return SDValue();
-
-  // fold: bitcast(freeze(load)) -> freeze(bitcast(load))
-  // fold: sext(freeze(load)) -> freeze(sext(load))
-  // fold: zext(freeze(load)) -> freeze(zext(load))
-  // This allows the conversion to potentially fold into the load.
-  if (N0.getOpcode() == ISD::LOAD && N->hasOneUse()) {
-    SDNode *User = *N->user_begin();
-    unsigned UserOpcode = User->getOpcode();
-    if (UserOpcode == ISD::BITCAST || UserOpcode == ISD::SIGN_EXTEND ||
-        UserOpcode == ISD::ZERO_EXTEND) {
-      SDValue NewConv =
-          DAG.getNode(UserOpcode, SDLoc(User), User->getValueType(0), N0);
-      SDValue FrozenConv = DAG.getFreeze(NewConv);
-      DAG.ReplaceAllUsesWith(User, FrozenConv.getNode());
-      return SDValue(N, 0);
-    }
-  }
+  // Avoid folding extensions and bitcasts. Each of these operations handles
+  // FREEZE in its own visitor.
+  if (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND ||
+      N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::BITCAST)
+    return SDValue();
 
   // Fold freeze(op(x, ...)) -> op(freeze(x), ...).
   // Try to push freeze through instructions that propagate but don't produce
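
Presumably this early return is what keeps the new combines from cycling:
without it, visitFREEZE would sink the freeze straight back below the
conversion that the ext/bitcast visitors just hoisted it above, e.g.:

  visitSIGN_EXTEND: sext(freeze(x)) -> AssertSext(freeze(sext(x)), ...)
  visitFREEZE:      freeze(sext(x)) -> sext(freeze(x))  ; would undo it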
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5b677f6692ea6..d5c4235d2c5a0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3448,19 +3448,11 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
     return false;
 
-  // With low alignment, don't convert integer vectors to large scalar loads,
-  // because otherwise they get broken into many small scalar loads.
-  if (LoadVT.isVector() && LoadVT.isInteger() && !BitcastVT.isVector() &&
-      BitcastVT.isInteger()) {
-    const DataLayout &DL = DAG.getDataLayout();
-    unsigned MinAlign = DL.getPointerSize();
-    // Aligned well, will legalize into a clean sequence of loads.
-    if (MMO.getAlign() >= MinAlign)
-      return true;
-    // Aligned poorly for a large enough scalar.
-    if (BitcastVT.getSizeInBits() > 2 * DL.getPointerSizeInBits())
-      return false;
-  }
+  // If we have a large vector type (even if illegal), don't bitcast it to a
+  // large (illegal) scalar type. Better to load fewer vectors and extract.
+  if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
+      BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
+    return false;
 
   // If both types are legal vectors, it's always ok to convert them.
   if (LoadVT.isVector() && BitcastVT.isVector() &&
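
For illustration, my reading of the revised check: an integer vector whose
size is a multiple of 128 bits now always refuses the bitcast-to-scalar
fold, regardless of alignment, so a case like the following keeps the
vector load and extracts pieces rather than forming an illegal i256
scalar load:

  define i256 @keep_vector_load(ptr %p) {
    %v = load <8 x i32>, ptr %p        ; 256 bits, a multiple of 128
    %s = bitcast <8 x i32> %v to i256  ; isLoadBitCastBeneficial -> false
    ret i256 %s
  }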
diff --git a/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll b/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
index 8124d35b063a7..361005dfb8664 100644
--- a/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
+++ b/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
@@ -26,8 +26,7 @@ define i32 @test_sext_freeze_load_i8(ptr %p) {
 define i64 @test_sext_freeze_load_i32(ptr %p) {
 ; CHECK-LABEL: test_sext_freeze_load_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    sxtw x0, w8
+; CHECK-NEXT:    ldrsw x0, [x0]
 ; CHECK-NEXT:    ret
   %v = load i32, ptr %p
   %f = freeze i32 %v
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
index fb909fec90434..5920de998977a 100644
--- a/llvm/test/CodeGen/AArch64/freeze.ll
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -376,10 +376,14 @@ define i32 @freeze_anonstruct() {
 }
 
 define i32 @freeze_anonstruct2() {
-; CHECK-LABEL: freeze_anonstruct2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w0, w8, w8, uxth
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: freeze_anonstruct2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_anonstruct2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add w0, w8, w8, uxth
+; CHECK-GI-NEXT:    ret
   %y1 = freeze {i32, i16} undef
   %v1 = extractvalue {i32, i16} %y1, 0
   %v2 = extractvalue {i32, i16} %y1, 1
diff --git a/llvm/test/CodeGen/AArch64/pr66603.ll b/llvm/test/CodeGen/AArch64/pr66603.ll
index 2373b722fa04b..c265a9d5606f3 100644
--- a/llvm/test/CodeGen/AArch64/pr66603.ll
+++ b/llvm/test/CodeGen/AArch64/pr66603.ll
@@ -5,8 +5,7 @@
 define i32 @PR66603(double %x) nounwind {
 ; CHECK-LABEL: PR66603:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fcvtzs w8, d0
-; CHECK-NEXT:    sxtb w0, w8
+; CHECK-NEXT:    fcvtzs w0, d0
 ; CHECK-NEXT:    ret
   %as_i8 = fptosi double %x to i8
   %frozen_i8 = freeze i8 %as_i8
diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll
index 55c343164a1b8..78f5843442422 100644
--- a/llvm/test/CodeGen/AArch64/vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -12,15 +12,16 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) {
 ; CHECK-NEXT:    shl.4s v1, v1, #31
 ; CHECK-NEXT:    cmlt.4s v1, v1, #0
 ; CHECK-NEXT:    mov.s w9, v1[1]
-; CHECK-NEXT:    fmov w11, s1
 ; CHECK-NEXT:    mov.s w10, v1[2]
-; CHECK-NEXT:    and x12, x11, #0x1
+; CHECK-NEXT:    fmov w11, s1
 ; CHECK-NEXT:    bfi x8, x11, #2, #1
-; CHECK-NEXT:    mov x11, sp
+; CHECK-NEXT:    and x11, x11, #0x1
 ; CHECK-NEXT:    and x9, x9, #0x1
-; CHECK-NEXT:    add x9, x12, x9
+; CHECK-NEXT:    and w10, w10, #0x1
+; CHECK-NEXT:    add x9, x11, x9
+; CHECK-NEXT:    mov x11, sp
 ; CHECK-NEXT:    st1.s { v0 }[1], [x8]
-; CHECK-NEXT:    sub w10, w9, w10
+; CHECK-NEXT:    add w10, w9, w10
 ; CHECK-NEXT:    orr x9, x11, x9, lsl #2
 ; CHECK-NEXT:    bfi x11, x10, #2, #2
 ; CHECK-NEXT:    st1.s { v0 }[2], [x9]
@@ -420,15 +421,16 @@ define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) {
 ; CHECK-NEXT:    shl.4s v1, v1, #31
 ; CHECK-NEXT:    cmlt.4s v1, v1, #0
 ; CHECK-NEXT:    mov.s w8, v1[1]
-; CHECK-NEXT:    fmov w10, s1
 ; CHECK-NEXT:    mov.s w9, v1[2]
-; CHECK-NEXT:    and x12, x10, #0x1
+; CHECK-NEXT:    fmov w10, s1
 ; CHECK-NEXT:    bfi x11, x10, #2, #1
-; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    and x10, x10, #0x1
 ; CHECK-NEXT:    and x8, x8, #0x1
-; CHECK-NEXT:    add x8, x12, x8
+; CHECK-NEXT:    and w9, w9, #0x1
+; CHECK-NEXT:    add x8, x10, x8
+; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    st1.s { v0 }[1], [x11]
-; CHECK-NEXT:    sub w9, w8, w9
+; CHECK-NEXT:    add w9, w8, w9
 ; CHECK-NEXT:    orr x8, x10, x8, lsl #2
 ; CHECK-NEXT:    bfi x10, x9, #2, #2
 ; CHECK-NEXT:    st1.s { v0 }[2], [x8]
diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll
index 4f2b9c5a62669..c61c59068a319 100644
--- a/llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -594,10 +594,10 @@ define void @extension_in_loop_v16i8_to_v16i32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    cmge.16b v5, v4, #0
-; CHECK-NEXT:    tbl.16b v7, { v4 }, v0
-; CHECK-NEXT:    tbl.16b v16, { v4 }, v1
-; CHECK-NEXT:    tbl.16b v18, { v4 }, v2
-; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
+; CHECK-NEXT:    tbl.16b v7, { v4 }, v3
+; CHECK-NEXT:    tbl.16b v16, { v4 }, v2
+; CHECK-NEXT:    tbl.16b v18, { v4 }, v1
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v0
 ; CHECK-NEXT:    sshll2.8h v6, v5, #0
 ; CHECK-NEXT:    sshll.8h v5, v5, #0
 ; CHECK-NEXT:    sshll2.4s v17, v6, #0
@@ -664,10 +664,10 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    cmge.16b v5, v4, #0
-; CHECK-NEXT:    tbl.16b v7, { v4 }, v0
-; CHECK-NEXT:    tbl.16b v16, { v4 }, v1
-; CHECK-NEXT:    tbl.16b v18, { v4 }, v2
-; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
+; CHECK-NEXT:    tbl.16b v7, { v4 }, v3
+; CHECK-NEXT:    tbl.16b v16, { v4 }, v2
+; CHECK-NEXT:    tbl.16b v18, { v4 }, v1
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v0
 ; CHECK-NEXT:    sshll2.8h v6, v5, #0
 ; CHECK-NEXT:    sshll.8h v5, v5, #0
 ; CHECK-NEXT:    sshll2.4s v17, v6, #0
@@ -735,10 +735,10 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    cmge.16b v5, v4, #0
-; CHECK-NEXT:    tbl.16b v7, { v4 }, v0
-; CHECK-NEXT:    tbl.16b v16, { v4 }, v1
-; CHECK-NEXT:    tbl.16b v18, { v4 }, v2
-; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
+; CHECK-NEXT:    tbl.16b v7, { v4 }, v3
+; CHECK-NEXT:    tbl.16b v16, { v4 }, v2
+; CHECK-NEXT:    tbl.16b v18, { v4 }, v1
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v0
 ; CHECK-NEXT:    sshll2.8h v6, v5, #0
 ; CHECK-NEXT:    sshll.8h v5, v5, #0
 ; CHECK-NEXT:    sshll2.4s v17, v6, #0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 948811ea45f77..eacd960153c29 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7769,7 +7769,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ;
 ; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dword s0, s[4:5], 0xd
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
@@ -7938,7 +7938,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ;
 ; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
 ; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
@@ -9037,7 +9037,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ;
 ; GFX6-LABEL: srem_i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX6-NEXT:    s_load_dword s0, s[4:5], 0xd
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
@@ -9208,7 +9208,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ;
 ; GFX9-LABEL: srem_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
 ; GFX9-NEXT:    s_ashr_i32 s2, s1, 31
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 23c5f4f5506f3..d4b3f5c303467 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -11184,19 +11184,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX7LESS_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
 ; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
 ; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX7LESS_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
-; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
 ; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
@@ -11241,19 +11241,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX8_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
-; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX8_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX8_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
@@ -11297,19 +11297,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX9_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX9_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX9_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX9_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
@@ -13010,19 +13010,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX7LESS_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
 ; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
 ; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX7LESS_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
-; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
 ; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
@@ -13067,19 +13067,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX8_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
-; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX8_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX8_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
@@ -13123,19 +13123,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX9_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX9_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX9_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX9_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
@@ -14831,19 +14831,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX7LESS_ITERATIVE-NEXT:  .LBB29_1: ; %ComputeLoop
 ; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
 ; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX7LESS_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
-; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
 ; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
@@ -14887,19 +14887,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX8_ITERATIVE-NEXT:  .LBB29_1: ; %ComputeLoop
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
-; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX8_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX8_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
@@ -14942,19 +14942,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX9_ITERATIVE-NEXT:  .LBB29_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX9_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX9_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX9_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
@@ -16645,19 +16645,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX7LESS_ITERATIVE-NEXT:  .LBB32_1: ; %ComputeLoop
 ; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
 ; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX7LESS_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
-; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX7LESS_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
 ; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
@@ -16701,19 +16701,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX8_ITERATIVE-NEXT:  .LBB32_1: ; %ComputeLoop
 ; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
-; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX8_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX8_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX8_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
@@ -16756,19 +16756,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX9_ITERATIVE-NEXT:  .LBB32_1: ; %ComputeLoop
 ; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
-; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s10, v0, s8
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
-; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s10, s[2:3]
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s6
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX9_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s8
-; GFX9_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s10
+; GFX9_ITERATIVE-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s1, m0
 ; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s0, m0
-; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
-; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
-; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s7
+; GFX9_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s6
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s10
 ; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
 ; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 0fc54aeaef77b..c187aac4fc4a2 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -6,77 +6,77 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-LABEL: v_sdiv_v2i128_vv:
 ; SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, 0, v0
-; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_sub_i32_e32 v18, vcc, 0, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v24, 31, v3
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v25, 31, v11
-; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, 0, v1, vcc
+; SDAG-NEXT:    s_mov_b64 s[8:9], 0x7f
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, 0, v1, vcc
 ; SDAG-NEXT:    v_mov_b32_e32 v26, v24
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v25
-; SDAG-NEXT:    v_subb_u32_e32 v19, vcc, 0, v2, vcc
-; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT:    v_cndmask_b32_e64 v21, v1, v17, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v20, v0, v16, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, 0, v3, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v16, v2, v19, s[4:5]
-; SDAG-NEXT:    v_ffbh_u32_e32 v1, v20
-; SDAG-NEXT:    v_ffbh_u32_e32 v2, v21
-; SDAG-NEXT:    v_cndmask_b32_e64 v17, v3, v0, s[4:5]
-; SDAG-NEXT:    v_or_b32_e32 v0, v20, v16
-; SDAG-NEXT:    v_sub_i32_e32 v3, vcc, 0, v8
-; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], 32, v1
-; SDAG-NEXT:    v_ffbh_u32_e32 v22, v16
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, 0, v2, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, 0, v3, vcc
+; SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e32 v17, v3, v17, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v16, v2, v16, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v21, v1, v20, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v20, v0, v18, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, 0, v8
 ; SDAG-NEXT:    v_or_b32_e32 v1, v21, v17
-; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT:    v_min_u32_e32 v2, v19, v2
-; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], 32, v22
+; SDAG-NEXT:    v_or_b32_e32 v0, v20, v16
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v18, v16
 ; SDAG-NEXT:    v_ffbh_u32_e32 v22, v17
+; SDAG-NEXT:    v_ffbh_u32_e32 v23, v20
+; SDAG-NEXT:    v_ffbh_u32_e32 v28, v21
 ; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; SDAG-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v28, v9, v23, s[6:7]
 ; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, 0, v10, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v29, v8, v3, s[6:7]
-; SDAG-NEXT:    v_min_u32_e32 v1, v19, v22
-; SDAG-NEXT:    v_add_i32_e64 v2, s[8:9], 64, v2
-; SDAG-NEXT:    v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, 0, v11, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, v10, v0, s[6:7]
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT:    v_cndmask_b32_e64 v9, v3, 0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v10, v2, v1, vcc
-; SDAG-NEXT:    v_ffbh_u32_e32 v3, v29
-; SDAG-NEXT:    v_ffbh_u32_e32 v19, v28
-; SDAG-NEXT:    v_cndmask_b32_e64 v1, v11, v8, s[6:7]
-; SDAG-NEXT:    v_or_b32_e32 v2, v29, v0
-; SDAG-NEXT:    v_add_i32_e32 v8, vcc, 32, v3
-; SDAG-NEXT:    v_ffbh_u32_e32 v11, v0
+; SDAG-NEXT:    v_add_i32_e64 v1, s[6:7], 32, v18
+; SDAG-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v23
+; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, 0, v11, vcc
+; SDAG-NEXT:    v_min_u32_e32 v22, v1, v22
+; SDAG-NEXT:    v_min_u32_e32 v18, v18, v28
+; SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v11, v23, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v28, v9, v3, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v29, v8, v2, vcc
+; SDAG-NEXT:    v_add_i32_e32 v8, vcc, 64, v18
+; SDAG-NEXT:    v_addc_u32_e64 v9, s[6:7], 0, 0, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v3, v28, v1
-; SDAG-NEXT:    v_min_u32_e32 v8, v8, v19
-; SDAG-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
-; SDAG-NEXT:    v_ffbh_u32_e32 v19, v1
+; SDAG-NEXT:    v_or_b32_e32 v2, v29, v0
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v22, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v10, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v11, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v18, v29
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v28
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT:    v_min_u32_e32 v2, v11, v19
-; SDAG-NEXT:    v_add_i32_e64 v3, s[6:7], 64, v8
-; SDAG-NEXT:    v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7]
-; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, v8, 0, s[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[6:7]
+; SDAG-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v10
+; SDAG-NEXT:    v_add_i32_e64 v3, s[6:7], 32, v18
+; SDAG-NEXT:    v_min_u32_e32 v2, v2, v11
+; SDAG-NEXT:    v_min_u32_e32 v3, v3, v22
 ; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
-; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v2, v10
-; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, v11, v9, vcc
+; SDAG-NEXT:    v_add_i32_e32 v3, vcc, 64, v3
+; SDAG-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v2, v8
+; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, v10, v9, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v2, 0x7f, v8
-; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v18, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9]
-; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v19, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[8:9]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v19, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v2, v2, v10
 ; SDAG-NEXT:    v_or_b32_e32 v3, v9, v11
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, v18, v19, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v19, v18, s[4:5]
 ; SDAG-NEXT:    v_and_b32_e32 v2, 1, v2
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v2
 ; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
@@ -1564,67 +1564,67 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, 0, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v28, 31, v3
-; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT:    s_mov_b64 s[8:9], 0x7f
 ; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, 0, v1, vcc
 ; SDAG-NEXT:    v_mov_b32_e32 v29, v28
 ; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, 0, v2, vcc
-; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT:    v_cndmask_b32_e64 v17, v1, v17, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v16, v0, v16, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, 0, v3, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, v2, v18, s[4:5]
-; SDAG-NEXT:    v_ffbh_u32_e32 v18, v16
-; SDAG-NEXT:    v_ffbh_u32_e32 v20, v17
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, 0, v3, vcc
 ; SDAG-NEXT:    v_sub_i32_e32 v21, vcc, 0, v8
-; SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT:    v_or_b32_e32 v2, v16, v0
-; SDAG-NEXT:    v_add_i32_e64 v18, s[4:5], 32, v18
-; SDAG-NEXT:    v_ffbh_u32_e32 v22, v0
-; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT:    v_or_b32_e32 v3, v17, v1
-; SDAG-NEXT:    v_min_u32_e32 v18, v18, v20
-; SDAG-NEXT:    v_add_i32_e64 v20, s[4:5], 32, v22
-; SDAG-NEXT:    v_ffbh_u32_e32 v22, v1
-; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v30, v9, v23, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, 0, v10, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v31, v8, v21, s[4:5]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
-; SDAG-NEXT:    v_min_u32_e32 v3, v20, v22
-; SDAG-NEXT:    v_add_i32_e64 v8, s[8:9], 64, v18
-; SDAG-NEXT:    v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v3, v20, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_or_b32_e32 v17, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v16, v0, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v3
+; SDAG-NEXT:    v_ffbh_u32_e32 v23, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v24, v1
+; SDAG-NEXT:    v_subb_u32_e32 v25, vcc, 0, v10, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[16:17]
+; SDAG-NEXT:    v_add_i32_e64 v16, s[6:7], 32, v20
+; SDAG-NEXT:    v_add_i32_e64 v17, s[6:7], 32, v23
 ; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, 0, v11, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, v10, v9, s[4:5]
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_min_u32_e32 v16, v16, v22
+; SDAG-NEXT:    v_min_u32_e32 v17, v17, v24
+; SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e32 v11, v11, v20, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, v10, v25, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v30, v9, v18, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v31, v8, v21, vcc
+; SDAG-NEXT:    v_add_i32_e32 v17, vcc, 64, v17
+; SDAG-NEXT:    v_addc_u32_e64 v18, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_or_b32_e32 v9, v30, v11
+; SDAG-NEXT:    v_or_b32_e32 v8, v31, v10
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v18, v18, 0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v10, v8, v3, vcc
-; SDAG-NEXT:    v_ffbh_u32_e32 v9, v31
-; SDAG-NEXT:    v_ffbh_u32_e32 v21, v30
-; SDAG-NEXT:    v_cndmask_b32_e64 v3, v11, v20, s[4:5]
-; SDAG-NEXT:    v_or_b32_e32 v8, v31, v2
-; SDAG-NEXT:    v_add_i32_e32 v11, vcc, 32, v9
-; SDAG-NEXT:    v_ffbh_u32_e32 v20, v2
-; SDAG-NEXT:    v_or_b32_e32 v9, v30, v3
-; SDAG-NEXT:    v_min_u32_e32 v11, v11, v21
-; SDAG-NEXT:    v_add_i32_e32 v20, vcc, 32, v20
-; SDAG-NEXT:    v_ffbh_u32_e32 v21, v3
+; SDAG-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v17, v10
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v11
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v31
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v30
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT:    v_min_u32_e32 v8, v20, v21
-; SDAG-NEXT:    v_add_i32_e64 v9, s[4:5], 64, v11
-; SDAG-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, v11, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, v9, v8, s[4:5]
-; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v8, v10
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v11, v18, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v8, 0x7f, v10
+; SDAG-NEXT:    v_add_i32_e64 v8, s[6:7], 32, v17
+; SDAG-NEXT:    v_add_i32_e64 v9, s[6:7], 32, v21
+; SDAG-NEXT:    v_min_u32_e32 v8, v8, v20
+; SDAG-NEXT:    v_min_u32_e32 v9, v9, v22
+; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT:    v_add_i32_e32 v9, vcc, 64, v9
+; SDAG-NEXT:    v_addc_u32_e64 v17, s[4:5], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v17, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v8, v16
+; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, v17, v18, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v8, 0x7f, v16
 ; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, 0, v19, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
 ; SDAG-NEXT:    v_subb_u32_e32 v19, vcc, 0, v19, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v8, v8, v18
-; SDAG-NEXT:    v_or_b32_e32 v9, v11, v19
+; SDAG-NEXT:    v_or_b32_e32 v9, v17, v19
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
@@ -1633,71 +1633,71 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_and_b32_e32 v8, 1, v8
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v8
 ; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v34, v1, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v34, v3, 0, s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT:    v_cndmask_b32_e64 v32, v0, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v27, v17, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v32, v2, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v27, v1, 0, s[4:5]
 ; SDAG-NEXT:    s_and_b64 s[8:9], s[6:7], vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v33, v16, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v33, v0, 0, s[4:5]
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[8:9]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_6
 ; SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
-; SDAG-NEXT:    v_add_i32_e32 v32, vcc, 1, v10
-; SDAG-NEXT:    v_sub_i32_e64 v20, s[4:5], 63, v10
+; SDAG-NEXT:    v_add_i32_e32 v32, vcc, 1, v16
+; SDAG-NEXT:    v_sub_i32_e64 v20, s[4:5], 63, v16
 ; SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v9, 0
-; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v11, vcc
-; SDAG-NEXT:    v_lshl_b64 v[20:21], v[16:17], v20
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v17, vcc
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[0:1], v20
 ; SDAG-NEXT:    v_addc_u32_e32 v34, vcc, 0, v18, vcc
 ; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, 0, v19, vcc
-; SDAG-NEXT:    v_or_b32_e32 v18, v32, v34
-; SDAG-NEXT:    v_sub_i32_e32 v24, vcc, 0x7f, v10
-; SDAG-NEXT:    v_or_b32_e32 v19, v33, v35
-; SDAG-NEXT:    v_lshl_b64 v[10:11], v[0:1], v24
-; SDAG-NEXT:    v_sub_i32_e32 v25, vcc, 64, v24
-; SDAG-NEXT:    v_lshl_b64 v[22:23], v[16:17], v24
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT:    v_lshr_b64 v[18:19], v[16:17], v25
-; SDAG-NEXT:    v_or_b32_e32 v11, v11, v19
-; SDAG-NEXT:    v_or_b32_e32 v10, v10, v18
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v24
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, v21, v11, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, v20, v10, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, v23, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, v22, s[4:5]
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, v11, v1, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, v10, v0, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v17, v32, v34
+; SDAG-NEXT:    v_sub_i32_e32 v19, vcc, 0x7f, v16
+; SDAG-NEXT:    v_or_b32_e32 v18, v33, v35
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[2:3], v19
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, 64, v19
+; SDAG-NEXT:    v_lshl_b64 v[24:25], v[0:1], v19
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[17:18]
+; SDAG-NEXT:    v_lshr_b64 v[16:17], v[0:1], v16
+; SDAG-NEXT:    v_or_b32_e32 v17, v23, v17
+; SDAG-NEXT:    v_or_b32_e32 v16, v22, v16
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v19
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v21, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v20, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, v25, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, v24, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v17, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v16, v2, s[4:5]
 ; SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_5
 ; SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT:    v_lshr_b64 v[22:23], v[16:17], v32
+; SDAG-NEXT:    v_lshr_b64 v[22:23], v[0:1], v32
 ; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, 64, v32
 ; SDAG-NEXT:    v_subrev_i32_e32 v37, vcc, 64, v32
-; SDAG-NEXT:    v_lshr_b64 v[24:25], v[0:1], v32
+; SDAG-NEXT:    v_lshr_b64 v[24:25], v[2:3], v32
 ; SDAG-NEXT:    v_add_i32_e32 v36, vcc, -1, v31
 ; SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
 ; SDAG-NEXT:    v_mov_b32_e32 v9, 0
-; SDAG-NEXT:    v_lshl_b64 v[26:27], v[0:1], v8
-; SDAG-NEXT:    v_lshr_b64 v[48:49], v[0:1], v37
+; SDAG-NEXT:    v_lshl_b64 v[26:27], v[2:3], v8
+; SDAG-NEXT:    v_lshr_b64 v[48:49], v[2:3], v37
 ; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v30, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v8, v23, v27
 ; SDAG-NEXT:    v_or_b32_e32 v22, v22, v26
-; SDAG-NEXT:    v_addc_u32_e32 v38, vcc, -1, v2, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v38, vcc, -1, v10, vcc
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v32
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, v49, v8, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v22, v48, v22, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v27, 0, v25, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v26, 0, v24, s[4:5]
-; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, -1, v3, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, -1, v11, vcc
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v32
-; SDAG-NEXT:    v_cndmask_b32_e32 v25, v8, v17, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v24, v22, v16, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v25, v8, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v24, v22, v0, vcc
 ; SDAG-NEXT:    v_mov_b32_e32 v22, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v23, 0
 ; SDAG-NEXT:  .LBB2_3: ; %udiv-do-while3
@@ -1707,13 +1707,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_lshl_b64 v[26:27], v[26:27], 1
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v48, 31, v25
 ; SDAG-NEXT:    v_lshl_b64 v[24:25], v[24:25], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v49, 31, v11
-; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v49, 31, v17
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
 ; SDAG-NEXT:    v_or_b32_e32 v21, v23, v21
 ; SDAG-NEXT:    v_or_b32_e32 v20, v22, v20
 ; SDAG-NEXT:    v_or_b32_e32 v22, v26, v48
 ; SDAG-NEXT:    v_or_b32_e32 v23, v24, v49
-; SDAG-NEXT:    v_or_b32_e32 v10, v10, v8
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v8
 ; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v36, v23
 ; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v37, v25, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v38, v22, vcc
@@ -1721,8 +1721,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
 ; SDAG-NEXT:    v_and_b32_e32 v24, v8, v31
 ; SDAG-NEXT:    v_and_b32_e32 v26, v8, v30
-; SDAG-NEXT:    v_and_b32_e32 v48, v8, v2
-; SDAG-NEXT:    v_and_b32_e32 v49, v8, v3
+; SDAG-NEXT:    v_and_b32_e32 v48, v8, v10
+; SDAG-NEXT:    v_and_b32_e32 v49, v8, v11
 ; SDAG-NEXT:    v_and_b32_e32 v8, 1, v8
 ; SDAG-NEXT:    v_sub_i32_e32 v24, vcc, v23, v24
 ; SDAG-NEXT:    v_subb_u32_e32 v25, vcc, v25, v26, vcc
@@ -1735,9 +1735,9 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_or_b32_e32 v22, v32, v34
 ; SDAG-NEXT:    v_or_b32_e32 v23, v33, v35
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[22:23]
-; SDAG-NEXT:    v_or_b32_e32 v11, v19, v11
+; SDAG-NEXT:    v_or_b32_e32 v17, v19, v17
 ; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT:    v_or_b32_e32 v10, v18, v10
+; SDAG-NEXT:    v_or_b32_e32 v16, v18, v16
 ; SDAG-NEXT:    v_mov_b32_e32 v23, v9
 ; SDAG-NEXT:    v_mov_b32_e32 v22, v8
 ; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
@@ -1746,123 +1746,123 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; SDAG-NEXT:  .LBB2_5: ; %Flow14
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v22, 31, v21
-; SDAG-NEXT:    v_or_b32_e32 v10, v10, v22
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v22
 ; SDAG-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT:    v_or_b32_e32 v34, v19, v11
-; SDAG-NEXT:    v_or_b32_e32 v32, v18, v10
+; SDAG-NEXT:    v_or_b32_e32 v34, v19, v17
+; SDAG-NEXT:    v_or_b32_e32 v32, v18, v16
 ; SDAG-NEXT:    v_or_b32_e32 v27, v9, v21
 ; SDAG-NEXT:    v_or_b32_e32 v33, v8, v20
 ; SDAG-NEXT:  .LBB2_6: ; %Flow16
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v26, 31, v7
 ; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
-; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
 ; SDAG-NEXT:    v_mov_b32_e32 v35, v26
 ; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, 0, v5, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v6, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, 0, v6, vcc
 ; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v9, v5, v9, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, v4, v8, s[4:5]
 ; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, 0, v7, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v4, v6, v10, s[4:5]
-; SDAG-NEXT:    v_ffbh_u32_e32 v10, v8
-; SDAG-NEXT:    v_ffbh_u32_e32 v11, v9
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v6, v16, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v16, v8
+; SDAG-NEXT:    v_ffbh_u32_e32 v18, v9
 ; SDAG-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
 ; SDAG-NEXT:    v_sub_i32_e32 v19, vcc, 0, v12
 ; SDAG-NEXT:    v_or_b32_e32 v6, v8, v4
 ; SDAG-NEXT:    v_ffbh_u32_e32 v20, v4
-; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], 32, v10
+; SDAG-NEXT:    v_add_i32_e64 v16, s[4:5], 32, v16
 ; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, 0, v13, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v7, v9, v5
 ; SDAG-NEXT:    v_add_i32_e64 v20, s[4:5], 32, v20
 ; SDAG-NEXT:    v_ffbh_u32_e32 v22, v5
-; SDAG-NEXT:    v_min_u32_e32 v10, v10, v11
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v14, vcc
+; SDAG-NEXT:    v_min_u32_e32 v16, v16, v18
+; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, 0, v14, vcc
 ; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v36, v13, v21, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v37, v12, v19, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[6:7]
 ; SDAG-NEXT:    v_min_u32_e32 v7, v20, v22
-; SDAG-NEXT:    v_add_i32_e64 v10, s[8:9], 64, v10
-; SDAG-NEXT:    v_addc_u32_e64 v12, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT:    v_subb_u32_e32 v13, vcc, 0, v15, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v6, v14, v11, s[4:5]
-; SDAG-NEXT:    v_ffbh_u32_e32 v11, v37
-; SDAG-NEXT:    v_ffbh_u32_e32 v14, v36
+; SDAG-NEXT:    v_add_i32_e64 v12, s[8:9], 64, v16
+; SDAG-NEXT:    v_addc_u32_e64 v13, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, 0, v15, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v14, v18, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v14, v37
+; SDAG-NEXT:    v_ffbh_u32_e32 v18, v36
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v12, v12, 0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v19, v10, v7, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v7, v15, v13, s[4:5]
-; SDAG-NEXT:    v_or_b32_e32 v10, v37, v6
-; SDAG-NEXT:    v_ffbh_u32_e32 v13, v6
-; SDAG-NEXT:    v_add_i32_e32 v15, vcc, 32, v11
-; SDAG-NEXT:    v_or_b32_e32 v11, v36, v7
-; SDAG-NEXT:    v_add_i32_e32 v13, vcc, 32, v13
-; SDAG-NEXT:    v_ffbh_u32_e32 v20, v7
-; SDAG-NEXT:    v_min_u32_e32 v14, v15, v14
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT:    v_min_u32_e32 v10, v13, v20
-; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], 64, v14
-; SDAG-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, v13, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v20, v12, v7, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, v15, v16, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v12, v37, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v15, v6
+; SDAG-NEXT:    v_add_i32_e32 v14, vcc, 32, v14
+; SDAG-NEXT:    v_or_b32_e32 v13, v36, v7
+; SDAG-NEXT:    v_add_i32_e32 v15, vcc, 32, v15
+; SDAG-NEXT:    v_ffbh_u32_e32 v16, v7
+; SDAG-NEXT:    v_min_u32_e32 v14, v14, v18
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; SDAG-NEXT:    v_min_u32_e32 v12, v15, v16
+; SDAG-NEXT:    v_add_i32_e64 v13, s[4:5], 64, v14
+; SDAG-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5]
 ; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v13, v13, 0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
-; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v13, v12, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v12, 0x7f, v10
-; SDAG-NEXT:    v_subb_u32_e32 v14, vcc, 0, v18, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v15, vcc, 0, v18, vcc
-; SDAG-NEXT:    v_or_b32_e32 v12, v12, v14
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v14, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v12, vcc, v12, v20
+; SDAG-NEXT:    v_subb_u32_e32 v13, vcc, v14, v19, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v14, 0x7f, v12
+; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, 0, v17, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[12:13]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, 0, v17, vcc
+; SDAG-NEXT:    v_or_b32_e32 v14, v14, v16
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v15, v13, v17
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v13, v11, v15
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; SDAG-NEXT:    v_and_b32_e32 v12, 1, v18
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v12
+; SDAG-NEXT:    v_and_b32_e32 v14, 1, v18
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v14
 ; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v19, v5, 0, s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v18, v4, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v13, v9, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v12, v8, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v15, v9, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v8, 0, s[4:5]
 ; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_12
 ; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT:    v_add_i32_e32 v38, vcc, 1, v10
-; SDAG-NEXT:    v_sub_i32_e64 v18, s[4:5], 63, v10
-; SDAG-NEXT:    v_mov_b32_e32 v12, 0
-; SDAG-NEXT:    v_mov_b32_e32 v13, 0
-; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, 0, v11, vcc
+; SDAG-NEXT:    v_add_i32_e32 v38, vcc, 1, v12
+; SDAG-NEXT:    v_sub_i32_e64 v18, s[4:5], 63, v12
+; SDAG-NEXT:    v_mov_b32_e32 v14, 0
+; SDAG-NEXT:    v_mov_b32_e32 v15, 0
+; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, 0, v13, vcc
 ; SDAG-NEXT:    v_lshl_b64 v[18:19], v[8:9], v18
-; SDAG-NEXT:    v_addc_u32_e32 v48, vcc, 0, v14, vcc
-; SDAG-NEXT:    v_addc_u32_e32 v49, vcc, 0, v15, vcc
-; SDAG-NEXT:    v_or_b32_e32 v14, v38, v48
-; SDAG-NEXT:    v_sub_i32_e32 v22, vcc, 0x7f, v10
-; SDAG-NEXT:    v_or_b32_e32 v15, v39, v49
-; SDAG-NEXT:    v_lshl_b64 v[10:11], v[4:5], v22
+; SDAG-NEXT:    v_addc_u32_e32 v48, vcc, 0, v16, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v49, vcc, 0, v17, vcc
+; SDAG-NEXT:    v_or_b32_e32 v16, v38, v48
+; SDAG-NEXT:    v_sub_i32_e32 v22, vcc, 0x7f, v12
+; SDAG-NEXT:    v_or_b32_e32 v17, v39, v49
+; SDAG-NEXT:    v_lshl_b64 v[12:13], v[4:5], v22
 ; SDAG-NEXT:    v_sub_i32_e32 v23, vcc, 64, v22
 ; SDAG-NEXT:    v_lshl_b64 v[20:21], v[8:9], v22
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT:    v_lshr_b64 v[14:15], v[8:9], v23
-; SDAG-NEXT:    v_or_b32_e32 v11, v11, v15
-; SDAG-NEXT:    v_or_b32_e32 v10, v10, v14
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_lshr_b64 v[16:17], v[8:9], v23
+; SDAG-NEXT:    v_or_b32_e32 v13, v13, v17
+; SDAG-NEXT:    v_or_b32_e32 v12, v12, v16
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v22
-; SDAG-NEXT:    v_cndmask_b32_e64 v14, v19, v11, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, v18, v10, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v21, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v20, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v19, v13, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v18, v12, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, 0, v21, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v20, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v22
-; SDAG-NEXT:    v_cndmask_b32_e64 v15, v14, v5, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v14, v18, v4, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v16, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v18, v4, s[4:5]
 ; SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -1870,52 +1870,52 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_11
 ; SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
 ; SDAG-NEXT:    v_lshr_b64 v[20:21], v[8:9], v38
-; SDAG-NEXT:    v_sub_i32_e32 v12, vcc, 64, v38
+; SDAG-NEXT:    v_sub_i32_e32 v14, vcc, 64, v38
 ; SDAG-NEXT:    v_subrev_i32_e32 v51, vcc, 64, v38
 ; SDAG-NEXT:    v_lshr_b64 v[22:23], v[4:5], v38
 ; SDAG-NEXT:    v_add_i32_e32 v50, vcc, -1, v37
 ; SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
-; SDAG-NEXT:    v_mov_b32_e32 v13, 0
-; SDAG-NEXT:    v_lshl_b64 v[24:25], v[4:5], v12
+; SDAG-NEXT:    v_mov_b32_e32 v15, 0
+; SDAG-NEXT:    v_lshl_b64 v[24:25], v[4:5], v14
 ; SDAG-NEXT:    v_lshr_b64 v[53:54], v[4:5], v51
 ; SDAG-NEXT:    v_addc_u32_e32 v51, vcc, -1, v36, vcc
-; SDAG-NEXT:    v_or_b32_e32 v12, v21, v25
+; SDAG-NEXT:    v_or_b32_e32 v14, v21, v25
 ; SDAG-NEXT:    v_or_b32_e32 v20, v20, v24
 ; SDAG-NEXT:    v_addc_u32_e32 v52, vcc, -1, v6, vcc
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v38
-; SDAG-NEXT:    v_cndmask_b32_e64 v12, v54, v12, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v54, v14, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v20, v53, v20, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v25, 0, v23, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v24, 0, v22, s[4:5]
 ; SDAG-NEXT:    v_addc_u32_e32 v53, vcc, -1, v7, vcc
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v38
-; SDAG-NEXT:    v_cndmask_b32_e32 v23, v12, v9, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v23, v14, v9, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v22, v20, v8, vcc
 ; SDAG-NEXT:    v_mov_b32_e32 v20, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v21, 0
 ; SDAG-NEXT:  .LBB2_9: ; %udiv-do-while
 ; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SDAG-NEXT:    v_lshl_b64 v[24:25], v[24:25], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v12, 31, v23
+; SDAG-NEXT:    v_lshrrev_b32_e32 v14, 31, v23
 ; SDAG-NEXT:    v_lshl_b64 v[22:23], v[22:23], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v54, 31, v15
-; SDAG-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v55, 31, v11
-; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT:    v_or_b32_e32 v24, v24, v12
+; SDAG-NEXT:    v_lshrrev_b32_e32 v54, 31, v17
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v55, 31, v13
+; SDAG-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT:    v_or_b32_e32 v24, v24, v14
 ; SDAG-NEXT:    v_or_b32_e32 v22, v22, v54
-; SDAG-NEXT:    v_or_b32_e32 v12, v14, v55
-; SDAG-NEXT:    v_or_b32_e32 v15, v19, v15
-; SDAG-NEXT:    v_or_b32_e32 v11, v21, v11
-; SDAG-NEXT:    v_or_b32_e32 v14, v18, v12
-; SDAG-NEXT:    v_sub_i32_e32 v12, vcc, v50, v22
-; SDAG-NEXT:    v_subb_u32_e32 v12, vcc, v51, v23, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v12, vcc, v52, v24, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v12, vcc, v53, v25, vcc
-; SDAG-NEXT:    v_ashrrev_i32_e32 v21, 31, v12
-; SDAG-NEXT:    v_and_b32_e32 v12, 1, v21
+; SDAG-NEXT:    v_or_b32_e32 v14, v16, v55
+; SDAG-NEXT:    v_or_b32_e32 v17, v19, v17
+; SDAG-NEXT:    v_or_b32_e32 v13, v21, v13
+; SDAG-NEXT:    v_or_b32_e32 v16, v18, v14
+; SDAG-NEXT:    v_sub_i32_e32 v14, vcc, v50, v22
+; SDAG-NEXT:    v_subb_u32_e32 v14, vcc, v51, v23, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v14, vcc, v52, v24, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v14, vcc, v53, v25, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v21, 31, v14
+; SDAG-NEXT:    v_and_b32_e32 v14, 1, v21
 ; SDAG-NEXT:    v_and_b32_e32 v54, v21, v7
 ; SDAG-NEXT:    v_and_b32_e32 v55, v21, v6
 ; SDAG-NEXT:    v_and_b32_e32 v40, v21, v36
@@ -1932,80 +1932,80 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_or_b32_e32 v54, v38, v48
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[54:55]
 ; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT:    v_or_b32_e32 v10, v20, v10
-; SDAG-NEXT:    v_mov_b32_e32 v21, v13
-; SDAG-NEXT:    v_mov_b32_e32 v20, v12
+; SDAG-NEXT:    v_or_b32_e32 v12, v20, v12
+; SDAG-NEXT:    v_mov_b32_e32 v21, v15
+; SDAG-NEXT:    v_mov_b32_e32 v20, v14
 ; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; SDAG-NEXT:    s_cbranch_execnz .LBB2_9
 ; SDAG-NEXT:  ; %bb.10: ; %Flow
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; SDAG-NEXT:  .LBB2_11: ; %Flow11
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v20, 31, v11
-; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT:    v_or_b32_e32 v14, v14, v20
-; SDAG-NEXT:    v_or_b32_e32 v19, v19, v15
-; SDAG-NEXT:    v_or_b32_e32 v13, v13, v11
-; SDAG-NEXT:    v_or_b32_e32 v18, v18, v14
-; SDAG-NEXT:    v_or_b32_e32 v12, v12, v10
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v20, 31, v13
+; SDAG-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v20
+; SDAG-NEXT:    v_or_b32_e32 v19, v19, v17
+; SDAG-NEXT:    v_or_b32_e32 v15, v15, v13
+; SDAG-NEXT:    v_or_b32_e32 v18, v18, v16
+; SDAG-NEXT:    v_or_b32_e32 v14, v14, v12
 ; SDAG-NEXT:  .LBB2_12: ; %Flow12
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT:    v_mul_lo_u32 v14, v33, v3
-; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0
-; SDAG-NEXT:    v_mul_lo_u32 v24, v27, v2
+; SDAG-NEXT:    v_mul_lo_u32 v13, v33, v11
+; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v33, v10, 0
+; SDAG-NEXT:    v_mul_lo_u32 v10, v27, v10
 ; SDAG-NEXT:    v_mul_lo_u32 v25, v34, v31
 ; SDAG-NEXT:    v_mul_lo_u32 v34, v32, v30
-; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0
-; SDAG-NEXT:    v_mov_b32_e32 v15, 0
-; SDAG-NEXT:    v_mul_lo_u32 v38, v12, v7
-; SDAG-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v12, v6, 0
-; SDAG-NEXT:    v_mul_lo_u32 v39, v13, v6
-; SDAG-NEXT:    v_mul_lo_u32 v19, v19, v37
-; SDAG-NEXT:    v_mul_lo_u32 v48, v18, v36
-; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v37, v12, 0
-; SDAG-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; SDAG-NEXT:    v_mov_b32_e32 v14, v3
-; SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15]
-; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v16, v2
-; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], v21, v38
-; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v24
-; SDAG-NEXT:    v_mov_b32_e32 v14, v22
-; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[14:15]
-; SDAG-NEXT:    v_xor_b32_e32 v24, v16, v28
-; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], v21, v39
-; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11]
-; SDAG-NEXT:    v_add_i32_e64 v22, s[4:5], v23, v3
-; SDAG-NEXT:    v_addc_u32_e64 v23, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v31, vcc, v17, v2, vcc
-; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21]
-; SDAG-NEXT:    v_mov_b32_e32 v14, v7
-; SDAG-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v36, v12, v[14:15]
+; SDAG-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v31, v33, 0
+; SDAG-NEXT:    v_mov_b32_e32 v20, 0
+; SDAG-NEXT:    v_mul_lo_u32 v38, v14, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v14, v6, 0
+; SDAG-NEXT:    v_mul_lo_u32 v39, v15, v6
+; SDAG-NEXT:    v_mul_lo_u32 v48, v19, v37
+; SDAG-NEXT:    v_mul_lo_u32 v49, v18, v36
+; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v37, v14, 0
+; SDAG-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; SDAG-NEXT:    v_mov_b32_e32 v19, v17
+; SDAG-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v30, v33, v[19:20]
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
+; SDAG-NEXT:    v_add_i32_e64 v13, s[4:5], v22, v38
+; SDAG-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v10
+; SDAG-NEXT:    v_mov_b32_e32 v19, v23
+; SDAG-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v31, v27, v[19:20]
+; SDAG-NEXT:    v_xor_b32_e32 v23, v0, v28
+; SDAG-NEXT:    v_add_i32_e64 v22, s[4:5], v13, v39
+; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[11:12]
+; SDAG-NEXT:    v_add_i32_e64 v12, s[4:5], v24, v17
+; SDAG-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v24, vcc, v1, v16, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v18, v37, v[21:22]
+; SDAG-NEXT:    v_mov_b32_e32 v19, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v36, v14, v[19:20]
 ; SDAG-NEXT:    v_add_i32_e64 v7, s[4:5], v25, v11
-; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[22:23]
-; SDAG-NEXT:    v_xor_b32_e32 v18, v31, v29
-; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v19, v3
-; SDAG-NEXT:    v_mov_b32_e32 v14, v16
-; SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[14:15]
+; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[12:13]
+; SDAG-NEXT:    v_xor_b32_e32 v18, v24, v29
+; SDAG-NEXT:    v_add_i32_e64 v1, s[4:5], v48, v1
+; SDAG-NEXT:    v_mov_b32_e32 v19, v16
+; SDAG-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v37, v15, v[19:20]
 ; SDAG-NEXT:    v_add_i32_e64 v7, s[4:5], v34, v7
-; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v48, v3
-; SDAG-NEXT:    v_add_i32_e64 v15, s[4:5], v17, v15
-; SDAG-NEXT:    v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
+; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], v49, v1
+; SDAG-NEXT:    v_add_i32_e64 v16, s[4:5], v17, v14
+; SDAG-NEXT:    v_addc_u32_e64 v17, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_add_i32_e64 v1, s[4:5], v11, v10
 ; SDAG-NEXT:    v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v0, v10, vcc
-; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v36, v13, v[15:16]
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v7, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v7, v0, v28
-; SDAG-NEXT:    v_add_i32_e32 v10, vcc, v10, v2
-; SDAG-NEXT:    v_addc_u32_e32 v11, vcc, v11, v3, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v3, v1, v29
-; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v24, v28
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v2, v1, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v36, v15, v[16:17]
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v7, v10, v28
+; SDAG-NEXT:    v_add_i32_e32 v10, vcc, v1, v0
+; SDAG-NEXT:    v_addc_u32_e32 v11, vcc, v2, v19, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v3, v3, v29
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v23, v28
 ; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v18, v29, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v7, v28, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v29, vcc
 ; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v8, v6
-; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v9, v14, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v9, v13, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v6, v6, v26
 ; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v4, v10, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v7, v7, v35
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index a025c36f620c7..7e233e648cdbc 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -211,22 +211,23 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v0, -1
+; SI-NEXT:    v_mov_b32_e32 v1, 0x432fffff
 ; SI-NEXT:    s_brev_b32 s8, -2
-; SI-NEXT:    v_mov_b32_e32 v1, 0x43300000
-; SI-NEXT:    v_mov_b32_e32 v0, 0
-; SI-NEXT:    v_mov_b32_e32 v2, -1
-; SI-NEXT:    v_mov_b32_e32 v3, 0x432fffff
+; SI-NEXT:    v_mov_b32_e32 v3, 0x43300000
+; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_mov_b32_e32 v4, s3
+; SI-NEXT:    v_bfi_b32 v3, s8, v3, v4
 ; SI-NEXT:    v_mov_b32_e32 v6, s3
-; SI-NEXT:    v_bfi_b32 v1, s8, v1, v6
 ; SI-NEXT:    v_mov_b32_e32 v7, s2
-; SI-NEXT:    v_add_f64 v[4:5], s[2:3], v[0:1]
-; SI-NEXT:    v_add_f64 v[0:1], v[4:5], -v[0:1]
-; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[2:3]
-; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; SI-NEXT:    v_add_f64 v[4:5], s[2:3], v[2:3]
+; SI-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
+; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[0:1]
+; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v7, vcc
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -270,19 +271,21 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s9, 0x432fffff
 ; SI-NEXT:    s_brev_b32 s10, -2
 ; SI-NEXT:    v_mov_b32_e32 v6, 0x43300000
-; SI-NEXT:    s_mov_b32 s9, 0x432fffff
 ; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    v_mov_b32_e32 v4, s8
 ; SI-NEXT:    v_mov_b32_e32 v5, s9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_bfi_b32 v1, s10, v6, v1
 ; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_bfi_b32 v1, s10, v6, v7
 ; SI-NEXT:    v_mov_b32_e32 v8, s6
 ; SI-NEXT:    v_mov_b32_e32 v9, s5
-; SI-NEXT:    v_mov_b32_e32 v10, s4
+; SI-NEXT:    v_mov_b32_e32 v10, s5
+; SI-NEXT:    v_mov_b32_e32 v11, s4
 ; SI-NEXT:    v_add_f64 v[2:3], s[6:7], v[0:1]
 ; SI-NEXT:    v_add_f64 v[2:3], v[2:3], -v[0:1]
 ; SI-NEXT:    v_bfi_b32 v1, s10, v6, v9
@@ -292,8 +295,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
 ; SI-NEXT:    v_add_f64 v[6:7], s[4:5], v[0:1]
 ; SI-NEXT:    v_add_f64 v[0:1], v[6:7], -v[0:1]
 ; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5]
-; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -347,26 +350,30 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x11
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s13, 0x432fffff
 ; SI-NEXT:    s_brev_b32 s14, -2
 ; SI-NEXT:    v_mov_b32_e32 v10, 0x43300000
-; SI-NEXT:    s_mov_b32 s13, 0x432fffff
 ; SI-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-NEXT:    s_mov_b32 s12, s10
 ; SI-NEXT:    v_mov_b32_e32 v8, s12
 ; SI-NEXT:    v_mov_b32_e32 v9, s13
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_bfi_b32 v5, s14, v10, v0
 ; SI-NEXT:    v_mov_b32_e32 v2, s3
-; SI-NEXT:    v_bfi_b32 v5, s14, v10, v2
 ; SI-NEXT:    v_mov_b32_e32 v6, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s1
 ; SI-NEXT:    v_mov_b32_e32 v7, s1
 ; SI-NEXT:    v_mov_b32_e32 v11, s0
 ; SI-NEXT:    v_mov_b32_e32 v12, s7
-; SI-NEXT:    v_mov_b32_e32 v13, s6
-; SI-NEXT:    v_mov_b32_e32 v14, s5
-; SI-NEXT:    v_mov_b32_e32 v15, s4
+; SI-NEXT:    v_mov_b32_e32 v13, s7
+; SI-NEXT:    v_mov_b32_e32 v14, s6
+; SI-NEXT:    v_mov_b32_e32 v15, s5
+; SI-NEXT:    v_mov_b32_e32 v16, s5
+; SI-NEXT:    v_mov_b32_e32 v17, s4
 ; SI-NEXT:    v_add_f64 v[0:1], s[2:3], v[4:5]
 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
-; SI-NEXT:    v_bfi_b32 v5, s14, v10, v7
+; SI-NEXT:    v_bfi_b32 v5, s14, v10, v3
 ; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9]
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v1, v2, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v6, vcc
@@ -378,15 +385,15 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
 ; SI-NEXT:    v_add_f64 v[6:7], s[6:7], v[4:5]
 ; SI-NEXT:    v_add_f64 v[6:7], v[6:7], -v[4:5]
-; SI-NEXT:    v_bfi_b32 v5, s14, v10, v14
+; SI-NEXT:    v_bfi_b32 v5, s14, v10, v15
 ; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v6, v13, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v13, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
 ; SI-NEXT:    v_add_f64 v[10:11], s[4:5], v[4:5]
 ; SI-NEXT:    v_add_f64 v[4:5], v[10:11], -v[4:5]
 ; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9]
-; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v14, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v15, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v16, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v17, vcc
 ; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 6f91222b2f396..a37e7dc5e31a9 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -4307,22 +4307,30 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
 ; SI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
-; SI-NEXT:    v_readfirstlane_b32 s0, v4
-; SI-NEXT:    v_readfirstlane_b32 s1, v5
-; SI-NEXT:    s_bfe_u32 s2, s1, 0xb0014
-; SI-NEXT:    s_add_i32 s8, s2, 0xfffffc01
-; SI-NEXT:    s_mov_b32 s3, 0xfffff
-; SI-NEXT:    s_mov_b32 s2, s6
-; SI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s8
-; SI-NEXT:    s_andn2_b64 s[2:3], s[0:1], s[2:3]
-; SI-NEXT:    s_and_b32 s9, s1, 0x80000000
+; SI-NEXT:    v_readfirstlane_b32 s2, v4
+; SI-NEXT:    v_readfirstlane_b32 s3, v5
+; SI-NEXT:    s_bfe_u32 s0, s3, 0xb0014
+; SI-NEXT:    s_add_i32 s8, s0, 0xfffffc01
+; SI-NEXT:    s_mov_b32 s1, 0xfffff
+; SI-NEXT:    s_mov_b32 s0, s6
+; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s8
+; SI-NEXT:    v_not_b32_e32 v6, s0
+; SI-NEXT:    v_and_b32_e32 v4, v4, v6
+; SI-NEXT:    v_not_b32_e32 v6, s1
+; SI-NEXT:    v_and_b32_e32 v5, v5, v6
+; SI-NEXT:    s_and_b32 s0, s3, 0x80000000
 ; SI-NEXT:    s_cmp_lt_i32 s8, 0
-; SI-NEXT:    s_cselect_b32 s2, 0, s2
-; SI-NEXT:    s_cselect_b32 s3, s9, s3
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
+; SI-NEXT:    v_mov_b32_e32 v6, s0
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; SI-NEXT:    s_cmp_gt_i32 s8, 51
-; SI-NEXT:    s_cselect_b32 s1, s1, s3
-; SI-NEXT:    s_cselect_b32 s0, s0, s2
-; SI-NEXT:    v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1]
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_mov_b32_e32 v6, s3
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; SI-NEXT:    v_mov_b32_e32 v6, s2
+; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -4585,22 +4593,30 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
 ; SI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
-; SI-NEXT:    v_readfirstlane_b32 s0, v4
-; SI-NEXT:    v_readfirstlane_b32 s1, v5
-; SI-NEXT:    s_bfe_u32 s2, s1, 0xb0014
-; SI-NEXT:    s_add_i32 s8, s2, 0xfffffc01
-; SI-NEXT:    s_mov_b32 s3, 0xfffff
-; SI-NEXT:    s_mov_b32 s2, s6
-; SI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s8
-; SI-NEXT:    s_andn2_b64 s[2:3], s[0:1], s[2:3]
-; SI-NEXT:    s_and_b32 s9, s1, 0x80000000
+; SI-NEXT:    v_readfirstlane_b32 s2, v4
+; SI-NEXT:    v_readfirstlane_b32 s3, v5
+; SI-NEXT:    s_bfe_u32 s0, s3, 0xb0014
+; SI-NEXT:    s_add_i32 s8, s0, 0xfffffc01
+; SI-NEXT:    s_mov_b32 s1, 0xfffff
+; SI-NEXT:    s_mov_b32 s0, s6
+; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s8
+; SI-NEXT:    v_not_b32_e32 v6, s0
+; SI-NEXT:    v_and_b32_e32 v4, v4, v6
+; SI-NEXT:    v_not_b32_e32 v6, s1
+; SI-NEXT:    v_and_b32_e32 v5, v5, v6
+; SI-NEXT:    s_and_b32 s0, s3, 0x80000000
 ; SI-NEXT:    s_cmp_lt_i32 s8, 0
-; SI-NEXT:    s_cselect_b32 s2, 0, s2
-; SI-NEXT:    s_cselect_b32 s3, s9, s3
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
+; SI-NEXT:    v_mov_b32_e32 v6, s0
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; SI-NEXT:    s_cmp_gt_i32 s8, 51
-; SI-NEXT:    s_cselect_b32 s1, s1, s3
-; SI-NEXT:    s_cselect_b32 s0, s0, s2
-; SI-NEXT:    v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1]
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_mov_b32_e32 v6, s3
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; SI-NEXT:    v_mov_b32_e32 v6, s2
+; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 6ae058b38e74f..c43a9ffa3d57d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -492,21 +492,21 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    v_mul_hi_i32 v1, s1, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mul_hi_u32 v3, s1, v2
 ; SI-NEXT:    s_mul_i32 s4, s1, s2
-; SI-NEXT:    v_mov_b32_e32 v2, s3
-; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
-; SI-NEXT:    s_mul_i32 s5, s0, s3
 ; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
-; SI-NEXT:    v_mul_hi_i32 v2, s1, v2
+; SI-NEXT:    s_mul_i32 s5, s0, s3
+; SI-NEXT:    v_mul_hi_u32 v2, s0, v2
 ; SI-NEXT:    s_mul_i32 s6, s1, s3
 ; SI-NEXT:    s_mul_i32 s8, s0, s2
-; SI-NEXT:    v_readfirstlane_b32 s9, v1
-; SI-NEXT:    v_readfirstlane_b32 s10, v3
-; SI-NEXT:    v_readfirstlane_b32 s11, v0
-; SI-NEXT:    v_readfirstlane_b32 s12, v2
-; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
+; SI-NEXT:    v_readfirstlane_b32 s9, v3
+; SI-NEXT:    v_readfirstlane_b32 s10, v0
+; SI-NEXT:    v_readfirstlane_b32 s11, v2
+; SI-NEXT:    v_readfirstlane_b32 s12, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v2
 ; SI-NEXT:    s_add_u32 s5, s11, s5
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v0
 ; SI-NEXT:    s_addc_u32 s10, 0, s10
@@ -540,31 +540,31 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s7, s0, s3
-; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
-; GFX9-NEXT:    s_add_u32 s9, s8, s7
+; GFX9-NEXT:    s_mul_i32 s8, s0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s7, s0, s3
+; GFX9-NEXT:    s_add_u32 s10, s9, s8
 ; GFX9-NEXT:    s_mul_i32 s6, s1, s2
-; GFX9-NEXT:    s_addc_u32 s5, 0, s5
-; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT:    s_add_u32 s9, s9, s6
-; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
-; GFX9-NEXT:    s_addc_u32 s4, s5, s4
-; GFX9-NEXT:    s_addc_u32 s5, s10, 0
-; GFX9-NEXT:    s_mul_i32 s9, s1, s3
-; GFX9-NEXT:    s_add_u32 s4, s4, s9
-; GFX9-NEXT:    s_addc_u32 s5, 0, s5
-; GFX9-NEXT:    s_sub_u32 s9, s4, s2
-; GFX9-NEXT:    s_subb_u32 s10, s5, 0
+; GFX9-NEXT:    s_addc_u32 s7, 0, s7
+; GFX9-NEXT:    s_mul_hi_u32 s5, s1, s2
+; GFX9-NEXT:    s_add_u32 s10, s10, s6
+; GFX9-NEXT:    s_mul_hi_i32 s4, s1, s3
+; GFX9-NEXT:    s_addc_u32 s5, s7, s5
+; GFX9-NEXT:    s_addc_u32 s4, s4, 0
+; GFX9-NEXT:    s_mul_i32 s7, s1, s3
+; GFX9-NEXT:    s_add_u32 s5, s5, s7
+; GFX9-NEXT:    s_addc_u32 s4, 0, s4
+; GFX9-NEXT:    s_sub_u32 s7, s5, s2
+; GFX9-NEXT:    s_subb_u32 s10, s4, 0
 ; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
-; GFX9-NEXT:    s_cselect_b32 s1, s10, s5
-; GFX9-NEXT:    s_sub_u32 s9, s4, s0
+; GFX9-NEXT:    s_cselect_b32 s1, s10, s4
+; GFX9-NEXT:    s_cselect_b32 s4, s7, s5
+; GFX9-NEXT:    s_sub_u32 s7, s4, s0
 ; GFX9-NEXT:    s_subb_u32 s5, s1, 0
 ; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
 ; GFX9-NEXT:    s_cselect_b32 s5, s5, s1
-; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
-; GFX9-NEXT:    s_add_i32 s1, s8, s7
+; GFX9-NEXT:    s_cselect_b32 s4, s7, s4
+; GFX9-NEXT:    s_add_i32 s1, s9, s8
 ; GFX9-NEXT:    s_add_i32 s1, s1, s6
 ; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
 ; GFX9-NEXT:    s_mov_b32 s7, s6
@@ -581,33 +581,33 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mul_i32 s7, s0, s3
-; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT:    s_mul_i32 s6, s1, s2
-; GFX10-NEXT:    s_add_u32 s11, s8, s7
-; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT:    s_addc_u32 s5, 0, s5
-; GFX10-NEXT:    s_mul_hi_i32 s9, s1, s3
-; GFX10-NEXT:    s_add_u32 s11, s11, s6
+; GFX10-NEXT:    s_mul_hi_u32 s6, s0, s2
+; GFX10-NEXT:    s_mul_i32 s9, s0, s3
+; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s3
+; GFX10-NEXT:    s_mul_i32 s7, s1, s2
+; GFX10-NEXT:    s_add_u32 s11, s6, s9
+; GFX10-NEXT:    s_mul_hi_u32 s5, s1, s2
+; GFX10-NEXT:    s_addc_u32 s8, 0, s8
+; GFX10-NEXT:    s_mul_hi_i32 s4, s1, s3
+; GFX10-NEXT:    s_add_u32 s11, s11, s7
 ; GFX10-NEXT:    s_mul_i32 s10, s1, s3
-; GFX10-NEXT:    s_addc_u32 s4, s5, s4
-; GFX10-NEXT:    s_addc_u32 s5, s9, 0
-; GFX10-NEXT:    s_add_u32 s4, s4, s10
-; GFX10-NEXT:    s_addc_u32 s5, 0, s5
-; GFX10-NEXT:    s_sub_u32 s9, s4, s2
-; GFX10-NEXT:    s_subb_u32 s10, s5, 0
+; GFX10-NEXT:    s_addc_u32 s5, s8, s5
+; GFX10-NEXT:    s_addc_u32 s4, s4, 0
+; GFX10-NEXT:    s_add_u32 s5, s5, s10
+; GFX10-NEXT:    s_addc_u32 s4, 0, s4
+; GFX10-NEXT:    s_sub_u32 s8, s5, s2
+; GFX10-NEXT:    s_subb_u32 s10, s4, 0
 ; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX10-NEXT:    s_cselect_b32 s1, s9, s4
-; GFX10-NEXT:    s_cselect_b32 s4, s10, s5
-; GFX10-NEXT:    s_sub_u32 s9, s1, s0
+; GFX10-NEXT:    s_cselect_b32 s1, s8, s5
+; GFX10-NEXT:    s_cselect_b32 s4, s10, s4
+; GFX10-NEXT:    s_sub_u32 s8, s1, s0
 ; GFX10-NEXT:    s_subb_u32 s5, s4, 0
 ; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
 ; GFX10-NEXT:    s_mul_i32 s0, s0, s2
 ; GFX10-NEXT:    s_cselect_b32 s5, s5, s4
-; GFX10-NEXT:    s_cselect_b32 s4, s9, s1
-; GFX10-NEXT:    s_add_i32 s1, s8, s7
-; GFX10-NEXT:    s_add_i32 s1, s1, s6
+; GFX10-NEXT:    s_cselect_b32 s4, s8, s1
+; GFX10-NEXT:    s_add_i32 s1, s6, s9
+; GFX10-NEXT:    s_add_i32 s1, s1, s7
 ; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
 ; GFX10-NEXT:    s_mov_b32 s7, s6
 ; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
@@ -622,34 +622,34 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mul_i32 s7, s0, s3
-; GFX11-NEXT:    s_mul_hi_u32 s8, s0, s2
-; GFX11-NEXT:    s_mul_hi_u32 s5, s0, s3
-; GFX11-NEXT:    s_mul_i32 s6, s1, s2
-; GFX11-NEXT:    s_add_u32 s11, s8, s7
-; GFX11-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX11-NEXT:    s_addc_u32 s5, 0, s5
-; GFX11-NEXT:    s_mul_hi_i32 s9, s1, s3
-; GFX11-NEXT:    s_add_u32 s11, s11, s6
+; GFX11-NEXT:    s_mul_hi_u32 s6, s0, s2
+; GFX11-NEXT:    s_mul_i32 s9, s0, s3
+; GFX11-NEXT:    s_mul_hi_u32 s8, s0, s3
+; GFX11-NEXT:    s_mul_i32 s7, s1, s2
+; GFX11-NEXT:    s_add_u32 s11, s6, s9
+; GFX11-NEXT:    s_mul_hi_u32 s5, s1, s2
+; GFX11-NEXT:    s_addc_u32 s8, 0, s8
+; GFX11-NEXT:    s_mul_hi_i32 s4, s1, s3
+; GFX11-NEXT:    s_add_u32 s11, s11, s7
 ; GFX11-NEXT:    s_mul_i32 s10, s1, s3
-; GFX11-NEXT:    s_addc_u32 s4, s5, s4
-; GFX11-NEXT:    s_addc_u32 s5, s9, 0
-; GFX11-NEXT:    s_add_u32 s4, s4, s10
-; GFX11-NEXT:    s_addc_u32 s5, 0, s5
-; GFX11-NEXT:    s_sub_u32 s9, s4, s2
-; GFX11-NEXT:    s_subb_u32 s10, s5, 0
+; GFX11-NEXT:    s_addc_u32 s5, s8, s5
+; GFX11-NEXT:    s_addc_u32 s4, s4, 0
+; GFX11-NEXT:    s_add_u32 s5, s5, s10
+; GFX11-NEXT:    s_addc_u32 s4, 0, s4
+; GFX11-NEXT:    s_sub_u32 s8, s5, s2
+; GFX11-NEXT:    s_subb_u32 s10, s4, 0
 ; GFX11-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX11-NEXT:    s_cselect_b32 s1, s9, s4
-; GFX11-NEXT:    s_cselect_b32 s4, s10, s5
-; GFX11-NEXT:    s_sub_u32 s9, s1, s0
+; GFX11-NEXT:    s_cselect_b32 s1, s8, s5
+; GFX11-NEXT:    s_cselect_b32 s4, s10, s4
+; GFX11-NEXT:    s_sub_u32 s8, s1, s0
 ; GFX11-NEXT:    s_subb_u32 s5, s4, 0
 ; GFX11-NEXT:    s_cmp_lt_i32 s3, 0
 ; GFX11-NEXT:    s_mul_i32 s0, s0, s2
 ; GFX11-NEXT:    s_cselect_b32 s5, s5, s4
-; GFX11-NEXT:    s_cselect_b32 s4, s9, s1
-; GFX11-NEXT:    s_add_i32 s1, s8, s7
+; GFX11-NEXT:    s_cselect_b32 s4, s8, s1
+; GFX11-NEXT:    s_add_i32 s1, s6, s9
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s1, s1, s6
+; GFX11-NEXT:    s_add_i32 s1, s1, s7
 ; GFX11-NEXT:    s_ashr_i32 s6, s1, 31
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mov_b32 s7, s6
@@ -666,17 +666,17 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    s_mov_b32 s5, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mul_hi_u32 s4, s0, s2
 ; GFX12-NEXT:    s_mul_hi_u32 s7, s0, s3
 ; GFX12-NEXT:    s_mul_i32 s6, s0, s3
-; GFX12-NEXT:    s_mul_hi_u32 s4, s0, s2
-; GFX12-NEXT:    s_mul_i32 s10, s1, s2
+; GFX12-NEXT:    s_mul_i32 s13, s1, s2
 ; GFX12-NEXT:    s_add_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX12-NEXT:    s_mul_hi_u32 s9, s1, s2
-; GFX12-NEXT:    s_mul_hi_i32 s11, s1, s3
-; GFX12-NEXT:    s_add_co_u32 s4, s6, s10
-; GFX12-NEXT:    s_add_co_ci_u32 s4, s7, s9
+; GFX12-NEXT:    s_mul_hi_u32 s12, s1, s2
+; GFX12-NEXT:    s_mul_hi_i32 s9, s1, s3
+; GFX12-NEXT:    s_add_co_u32 s4, s6, s13
+; GFX12-NEXT:    s_add_co_ci_u32 s4, s7, s12
 ; GFX12-NEXT:    s_mul_i32 s8, s1, s3
-; GFX12-NEXT:    s_add_co_ci_u32 s9, s11, 0
+; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, 0
 ; GFX12-NEXT:    s_cmp_lt_i32 s1, 0
 ; GFX12-NEXT:    s_add_nc_u64 s[6:7], s[4:5], s[8:9]
 ; GFX12-NEXT:    s_mov_b32 s4, s2
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 3d9c2a29cb9c1..2292105c14bc5 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -463,41 +463,39 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a,
 define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 {
 ; SI-LABEL: test_smul24_i33:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
+; SI-NEXT:    s_load_dword s4, s[4:5], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_bfe_i32 s0, s8, 0x180000
-; SI-NEXT:    s_bfe_i32 s1, s2, 0x180000
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    s_mul_i32 s0, s1, s0
-; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    s_bfe_i32 s5, s6, 0x180000
+; SI-NEXT:    s_bfe_i32 s4, s4, 0x180000
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    s_mul_i32 s5, s4, s5
+; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s4, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s5
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 31
 ; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], 31
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_smul24_i33:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bfe_i32 s2, s2, 0x180000
-; VI-NEXT:    s_bfe_i32 s3, s4, 0x180000
+; VI-NEXT:    s_bfe_i32 s3, s3, 0x180000
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, s2, v0
 ; VI-NEXT:    v_mul_i32_i24_e32 v0, s2, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
-; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    v_ashrrev_i64 v[0:1], 31, v[0:1]
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_smul24_i33:
@@ -576,32 +574,30 @@ entry:
 define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
 ; SI-LABEL: test_smulhi24_i33:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
+; SI-NEXT:    s_load_dword s7, s[4:5], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s2, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s7, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 1, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_smulhi24_i33:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
+; VI-NEXT:    s_load_dword s7, s[4:5], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s2, v0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s7, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 1, v0
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_smulhi24_i33:
diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll
index f497752994852..1d878a02d2525 100644
--- a/llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -846,8 +846,7 @@ define i64 @poison_should_freeze(i1 %cond1, i32 %val, i16 %val2, i64 %a, i64 %b)
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mov_b32_e32 v7, 0x5040100
-; GCN-NEXT:    v_perm_b32 v2, v2, s4, v7
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
index 8d0e00383d692..dcd7ed441fbae 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
@@ -3967,8 +3967,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX7-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5]
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX7-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -4071,8 +4071,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5]
-; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -4179,8 +4179,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX9-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5]
 ; GFX9-SDAG-NEXT:    s_nop 1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-SDAG-NEXT:    s_nop 1
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -4287,8 +4287,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
 ; GFX10-SDAG-NEXT:    v_cmp_gt_i64_e64 s4, v[2:3], v[6:7]
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
 ; GFX10-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4391,8 +4391,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
 ; GFX11-SDAG-NEXT:    v_cmp_gt_i64_e64 s0, v[2:3], v[6:7]
 ; GFX11-SDAG-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX11-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX11-SDAG-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -4506,8 +4506,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
 ; GFX12-SDAG-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
 ; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX12-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
index f15ecf014ab0b..515d36f9967a8 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
@@ -3966,8 +3966,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX7-SDAG-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5]
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX7-SDAG-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -4070,8 +4070,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-SDAG-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5]
-; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-SDAG-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -4178,8 +4178,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX9-SDAG-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5]
 ; GFX9-SDAG-NEXT:    s_nop 1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-SDAG-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-SDAG-NEXT:    s_nop 1
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -4286,8 +4286,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
 ; GFX10-SDAG-NEXT:    v_cmp_lt_i64_e64 s4, v[2:3], v[6:7]
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
 ; GFX10-SDAG-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4390,8 +4390,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
 ; GFX11-SDAG-NEXT:    v_cmp_lt_i64_e64 s0, v[2:3], v[6:7]
 ; GFX11-SDAG-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX11-SDAG-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX11-SDAG-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -4505,8 +4505,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
 ; GFX12-SDAG-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
 ; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX12-SDAG-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
index e62165cb933c5..fba4bd516183c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
@@ -3843,8 +3843,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX7-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX7-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -3947,8 +3947,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -4055,8 +4055,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX9-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX9-SDAG-NEXT:    s_nop 1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-SDAG-NEXT:    s_nop 1
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -4163,8 +4163,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
 ; GFX10-SDAG-NEXT:    v_cmp_gt_u64_e64 s4, v[2:3], v[6:7]
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
 ; GFX10-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4267,8 +4267,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
 ; GFX11-SDAG-NEXT:    v_cmp_gt_u64_e64 s0, v[2:3], v[6:7]
 ; GFX11-SDAG-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX11-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX11-SDAG-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -4382,8 +4382,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
 ; GFX12-SDAG-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
 ; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX12-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
index 83ecaaa7e0846..6ffff5968d4e0 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
@@ -3579,8 +3579,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX7-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX7-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -3683,8 +3683,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -3791,8 +3791,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX9-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX9-SDAG-NEXT:    s_nop 1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-SDAG-NEXT:    s_nop 1
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -3899,8 +3899,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
 ; GFX10-SDAG-NEXT:    v_cmp_lt_u64_e64 s4, v[2:3], v[6:7]
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s4
 ; GFX10-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4003,8 +4003,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
 ; GFX11-SDAG-NEXT:    v_cmp_lt_u64_e64 s0, v[2:3], v[6:7]
 ; GFX11-SDAG-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX11-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX11-SDAG-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -4118,8 +4118,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
 ; GFX12-SDAG-NEXT:    v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
 ; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX12-SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s0
 ; GFX12-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
 ; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
 ; GFX12-SDAG-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
diff --git a/llvm/test/CodeGen/RISCV/pr66603.ll b/llvm/test/CodeGen/RISCV/pr66603.ll
index cfe8ceed12582..eb3d1a3b916e2 100644
--- a/llvm/test/CodeGen/RISCV/pr66603.ll
+++ b/llvm/test/CodeGen/RISCV/pr66603.ll
@@ -7,15 +7,11 @@ define i32 @PR66603(double %x) nounwind {
 ; RV32-LABEL: PR66603:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    fcvt.w.d a0, fa0, rtz
-; RV32-NEXT:    slli a0, a0, 24
-; RV32-NEXT:    srai a0, a0, 24
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: PR66603:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    fcvt.l.d a0, fa0, rtz
-; RV64-NEXT:    slli a0, a0, 56
-; RV64-NEXT:    srai a0, a0, 56
 ; RV64-NEXT:    ret
   %as_i8 = fptosi double %x to i8
   %frozen_i8 = freeze i8 %as_i8
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 8a6a30318ae58..d18b64f12e527 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -74,40 +74,8 @@ define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-NEXT:    rosbg %r2, %r3, 32, 61, 2
 ; CHECK-NEXT:    rosbg %r2, %r5, 32, 62, 1
 ; CHECK-NEXT:    or %r2, %r14
-; CHECK-NEXT:    vlgvb %r4, %v0, 1
-; CHECK-NEXT:    vlgvb %r3, %v0, 0
-; CHECK-NEXT:    risbg %r3, %r3, 48, 176, 15
-; CHECK-NEXT:    rosbg %r3, %r4, 49, 49, 14
-; CHECK-NEXT:    vlgvb %r4, %v0, 2
-; CHECK-NEXT:    rosbg %r3, %r4, 50, 50, 13
-; CHECK-NEXT:    vlgvb %r4, %v0, 3
-; CHECK-NEXT:    rosbg %r3, %r4, 51, 51, 12
-; CHECK-NEXT:    vlgvb %r4, %v0, 4
-; CHECK-NEXT:    rosbg %r3, %r4, 52, 52, 11
-; CHECK-NEXT:    vlgvb %r4, %v0, 5
-; CHECK-NEXT:    rosbg %r3, %r4, 53, 53, 10
-; CHECK-NEXT:    vlgvb %r4, %v0, 6
-; CHECK-NEXT:    rosbg %r3, %r4, 54, 54, 9
-; CHECK-NEXT:    vlgvb %r4, %v0, 7
-; CHECK-NEXT:    rosbg %r3, %r4, 55, 55, 8
-; CHECK-NEXT:    vlgvb %r4, %v0, 8
-; CHECK-NEXT:    rosbg %r3, %r4, 56, 56, 7
-; CHECK-NEXT:    vlgvb %r4, %v0, 9
-; CHECK-NEXT:    rosbg %r3, %r4, 57, 57, 6
-; CHECK-NEXT:    vlgvb %r4, %v0, 10
-; CHECK-NEXT:    rosbg %r3, %r4, 58, 58, 5
-; CHECK-NEXT:    vlgvb %r4, %v0, 11
-; CHECK-NEXT:    rosbg %r3, %r4, 59, 59, 4
-; CHECK-NEXT:    vlgvb %r4, %v0, 12
-; CHECK-NEXT:    rosbg %r3, %r4, 60, 60, 3
-; CHECK-NEXT:    vlgvb %r4, %v0, 13
-; CHECK-NEXT:    rosbg %r3, %r4, 61, 61, 2
-; CHECK-NEXT:    vlgvb %r4, %v0, 14
-; CHECK-NEXT:    rosbg %r3, %r4, 62, 62, 1
-; CHECK-NEXT:    vlgvb %r4, %v0, 15
-; CHECK-NEXT:    rosbg %r3, %r4, 63, 63, 0
 ; CHECK-NEXT:    xilf %r3, 4294967295
-; CHECK-NEXT:    or %r3, %r2
+; CHECK-NEXT:    rosbg %r3, %r2, 48, 63, 0
 ; CHECK-NEXT:    tmll %r3, 65535
 ; CHECK-NEXT:    ipm %r2
 ; CHECK-NEXT:    afi %r2, -268435456
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
index 080c6c1a1efdc..27bc1e76a7ee2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
@@ -2902,7 +2902,7 @@ define arm_aapcs_vfpcc <8 x half> @faddqr_v8f16_y(<8 x half> %x, half %y, i32 %n
 ; CHECK-NEXT:    vctp.16 r0
 ; CHECK-NEXT:    vdup.16 q1, r1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddt.f16 q1, q0, r1
+; CHECK-NEXT:    vaddt.f16 q1, q1, q0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -2978,7 +2978,7 @@ define arm_aapcs_vfpcc <8 x half> @fmulqr_v8f16_y(<8 x half> %x, half %y, i32 %n
 ; CHECK-NEXT:    vctp.16 r0
 ; CHECK-NEXT:    vdup.16 q1, r1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.f16 q1, q0, r1
+; CHECK-NEXT:    vmult.f16 q1, q1, q0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 03f283a57a217..c60d9a3ff17d3 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -6,8 +6,7 @@
 define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: zext_8x8mem_to_8x16:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; KNL-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; KNL-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; KNL-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; KNL-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -22,8 +21,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone
 ;
 ; AVX512DQNOBW-LABEL: zext_8x8mem_to_8x16:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX512DQNOBW-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; AVX512DQNOBW-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; AVX512DQNOBW-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -37,8 +35,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone
 define <8 x i16> @sext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone {
 ; KNL-LABEL: sext_8x8mem_to_8x16:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; KNL-NEXT:    vpmovsxbw %xmm1, %xmm1
+; KNL-NEXT:    vpmovsxbw (%rdi), %xmm1
 ; KNL-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; KNL-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -53,8 +50,7 @@ define <8 x i16> @sext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone
 ;
 ; AVX512DQNOBW-LABEL: sext_8x8mem_to_8x16:
 ; AVX512DQNOBW:       # %bb.0:
-; AVX512DQNOBW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQNOBW-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX512DQNOBW-NEXT:    vpmovsxbw (%rdi), %xmm1
 ; AVX512DQNOBW-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; AVX512DQNOBW-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; AVX512DQNOBW-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -214,7 +210,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; KNL-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; KNL-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; KNL-NEXT:    vpsllw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpsllw $15, %ymm1, %ymm1
@@ -237,7 +233,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
 ; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512DQNOBW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQNOBW-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512DQNOBW-NEXT:    vpsllw $15, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpsllw $15, %ymm1, %ymm1
@@ -257,9 +253,9 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT:    vpmovsxbw (%rdi), %ymm2
-; KNL-NEXT:    vpmovsxbw 16(%rdi), %ymm3
-; KNL-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; KNL-NEXT:    vpmovsxbw 16(%rdi), %ymm2
+; KNL-NEXT:    vpmovsxbw (%rdi), %ymm3
+; KNL-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; KNL-NEXT:    vpsllw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; KNL-NEXT:    vpsllw $15, %ymm1, %ymm1
@@ -280,9 +276,9 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
 ; AVX512DQNOBW-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512DQNOBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT:    vpmovsxbw (%rdi), %ymm2
-; AVX512DQNOBW-NEXT:    vpmovsxbw 16(%rdi), %ymm3
-; AVX512DQNOBW-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQNOBW-NEXT:    vpmovsxbw 16(%rdi), %ymm2
+; AVX512DQNOBW-NEXT:    vpmovsxbw (%rdi), %ymm3
+; AVX512DQNOBW-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512DQNOBW-NEXT:    vpsllw $15, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpsllw $15, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index e223765eb887b..7e3a902044615 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -864,12 +864,11 @@ define i32 @freeze_ssubo(i32 %a0, i32 %a1, i8 %a2, i8 %a3) nounwind {
 ; X86-LABEL: freeze_ssubo:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    addb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    setb %cl
-; X86-NEXT:    andl $1, %ecx
-; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setb %dl
+; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
@@ -896,12 +895,11 @@ define i32 @freeze_usubo(i32 %a0, i32 %a1, i8 %a2, i8 %a3) nounwind {
 ; X86-LABEL: freeze_usubo:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    addb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    setb %cl
-; X86-NEXT:    andl $1, %ecx
-; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setb %dl
+; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll
index 38e3e23f7caac..a5549be92e793 100644
--- a/llvm/test/CodeGen/X86/freeze.ll
+++ b/llvm/test/CodeGen/X86/freeze.ll
@@ -96,8 +96,6 @@ define i32 @freeze_anonstruct() {
 define i32 @freeze_anonstruct2() {
 ; X86ASM-LABEL: freeze_anonstruct2:
 ; X86ASM:       # %bb.0:
-; X86ASM-NEXT:    movzwl %ax, %eax
-; X86ASM-NEXT:    addl %eax, %eax
 ; X86ASM-NEXT:    retq
   %y1 = freeze {i32, i16} undef
   %v1 = extractvalue {i32, i16} %y1, 0
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index a4750b4cd4ad0..b1237b31660c2 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -863,20 +863,18 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
 ; AVX512F-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
-; AVX512F-NEXT:    vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsubb %ymm0, %ymm7, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
+; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT:    vpsubb %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT:    vpsubb %ymm0, %ymm5, %ymm5
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm4 & (zmm1 ^ zmm0))
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg:
@@ -895,20 +893,18 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm0, %ymm7, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
+; AVX512VL-FALLBACK-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm5, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm0, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm4 & (zmm1 ^ zmm0))
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
 ;
 ; AVX512BW-LABEL: vec512_i8_signed_mem_reg:
@@ -953,19 +949,17 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512F-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT:    vpsubb %ymm3, %ymm7, %ymm3
-; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5))
+; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT:    vpsubb %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm5, %zmm3
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm4 & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -985,19 +979,17 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm3, %ymm7, %ymm3
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5))
+; AVX512VL-FALLBACK-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
 ; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm3, %ymm5, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm3, %zmm5, %zmm3
+; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm4 & (zmm3 ^ zmm2))
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 2f8cd4d41af54..c9ef6b6c4cdb2 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -4451,8 +4451,8 @@ define i32 @PR39665_c_ray_select(<2 x double> %x, <2 x double> %y) {
 ; KNL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
-; KNL-NEXT:    knotw %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    notb %al
 ; KNL-NEXT:    testb $3, %al
 ; KNL-NEXT:    movl $42, %ecx
 ; KNL-NEXT:    movl $99, %eax
@@ -4463,8 +4463,8 @@ define i32 @PR39665_c_ray_select(<2 x double> %x, <2 x double> %y) {
 ; SKX-LABEL: PR39665_c_ray_select:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vcmpltpd %xmm0, %xmm1, %k0
-; SKX-NEXT:    knotw %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    notb %al
 ; SKX-NEXT:    testb $3, %al
 ; SKX-NEXT:    movl $42, %ecx
 ; SKX-NEXT:    movl $99, %eax
diff --git a/llvm/test/CodeGen/X86/pr162812.ll b/llvm/test/CodeGen/X86/pr162812.ll
index cec093c3df743..02703b7e32cc6 100644
--- a/llvm/test/CodeGen/X86/pr162812.ll
+++ b/llvm/test/CodeGen/X86/pr162812.ll
@@ -34,32 +34,47 @@ define <32 x i8> @PR162812(<32 x i8> %a, <32 x i8> %mask) {
 ;
 ; SSE42-LABEL: PR162812:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa %xmm0, %xmm4
-; SSE42-NEXT:    psrlw $2, %xmm2
-; SSE42-NEXT:    movdqa {{.*#+}} xmm5 = [8224,8224,8224,8224,8224,8224,8224,8224]
-; SSE42-NEXT:    pand %xmm5, %xmm2
-; SSE42-NEXT:    paddb %xmm2, %xmm2
-; SSE42-NEXT:    paddb %xmm2, %xmm2
+; SSE42-NEXT:    movdqa %xmm2, %xmm5
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
 ; SSE42-NEXT:    movdqa %xmm0, %xmm6
-; SSE42-NEXT:    paddb %xmm0, %xmm6
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    pblendvb %xmm0, %xmm6, %xmm4
-; SSE42-NEXT:    psrlw $2, %xmm3
-; SSE42-NEXT:    pand %xmm3, %xmm5
+; SSE42-NEXT:    psllw $4, %xmm6
+; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE42-NEXT:    pand %xmm7, %xmm6
+; SSE42-NEXT:    psrlw $2, %xmm5
+; SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [8224,8224,8224,8224,8224,8224,8224,8224]
+; SSE42-NEXT:    pand %xmm4, %xmm5
+; SSE42-NEXT:    movdqa %xmm5, %xmm0
+; SSE42-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
+; SSE42-NEXT:    movdqa %xmm2, %xmm6
+; SSE42-NEXT:    paddb %xmm2, %xmm6
 ; SSE42-NEXT:    paddb %xmm5, %xmm5
 ; SSE42-NEXT:    paddb %xmm5, %xmm5
-; SSE42-NEXT:    movdqa %xmm1, %xmm2
-; SSE42-NEXT:    paddb %xmm1, %xmm2
 ; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE42-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
+; SSE42-NEXT:    movdqa %xmm1, %xmm5
+; SSE42-NEXT:    psllw $4, %xmm5
+; SSE42-NEXT:    pand %xmm7, %xmm5
+; SSE42-NEXT:    psrlw $2, %xmm3
+; SSE42-NEXT:    pand %xmm3, %xmm4
+; SSE42-NEXT:    movdqa %xmm4, %xmm0
+; SSE42-NEXT:    pblendvb %xmm0, %xmm5, %xmm1
+; SSE42-NEXT:    movdqa %xmm1, %xmm3
+; SSE42-NEXT:    paddb %xmm1, %xmm3
+; SSE42-NEXT:    paddb %xmm4, %xmm4
+; SSE42-NEXT:    paddb %xmm4, %xmm4
 ; SSE42-NEXT:    movdqa %xmm4, %xmm0
+; SSE42-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
+; SSE42-NEXT:    movdqa %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX2-LABEL: PR162812:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
+; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; AVX2-NEXT:    vpsrlw $2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
@@ -67,9 +82,12 @@ define <32 x i8> @PR162812(<32 x i8> %a, <32 x i8> %mask) {
 ;
 ; AVX512-LABEL: PR162812:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
+; AVX512-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
 ; AVX512-NEXT:    vpsrlw $2, %ymm1, %ymm1
 ; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
+; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
 ; AVX512-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
index b8e83da9cf361..ebb5e135eacd0 100644
--- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
@@ -281,7 +281,7 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-AVX2-NEXT:    vpsllvd %ymm1, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; X64-AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
 ; X64-AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; X64-AVX2-NEXT:    vpsrlvd %ymm1, %ymm3, %ymm1
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index fce879585289a..ebd57cb941552 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -501,39 +501,39 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
 ; SSE3-NEXT:    pextrw $0, %xmm1, %eax
 ; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
 ; SSE3-NEXT:    pextrw $2, %xmm1, %edx
-; SSE3-NEXT:    pextrw $3, %xmm1, %edi
-; SSE3-NEXT:    pextrw $4, %xmm1, %r8d
-; SSE3-NEXT:    pextrw $5, %xmm1, %r9d
-; SSE3-NEXT:    pextrw $6, %xmm1, %r10d
-; SSE3-NEXT:    pextrw $7, %xmm1, %esi
+; SSE3-NEXT:    pextrw $3, %xmm1, %esi
+; SSE3-NEXT:    pextrw $4, %xmm1, %edi
+; SSE3-NEXT:    pextrw $5, %xmm1, %r8d
+; SSE3-NEXT:    pextrw $6, %xmm1, %r9d
+; SSE3-NEXT:    pextrw $7, %xmm1, %r10d
 ; SSE3-NEXT:    movdqa %xmm2, -24(%rsp)
 ; SSE3-NEXT:    andl $7, %eax
-; SSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
 ; SSE3-NEXT:    andl $7, %ecx
-; SSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
 ; SSE3-NEXT:    andl $7, %edx
-; SSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
+; SSE3-NEXT:    andl $7, %esi
 ; SSE3-NEXT:    andl $7, %edi
-; SSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
 ; SSE3-NEXT:    andl $7, %r8d
-; SSE3-NEXT:    movzwl -24(%rsp,%r8,2), %r8d
 ; SSE3-NEXT:    andl $7, %r9d
-; SSE3-NEXT:    movzwl -24(%rsp,%r9,2), %r9d
 ; SSE3-NEXT:    andl $7, %r10d
 ; SSE3-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
-; SSE3-NEXT:    andl $7, %esi
-; SSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
-; SSE3-NEXT:    movd %esi, %xmm1
-; SSE3-NEXT:    movd %r10d, %xmm2
+; SSE3-NEXT:    movd %r10d, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%r9,2), %r9d
+; SSE3-NEXT:    movd %r9d, %xmm2
 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE3-NEXT:    movd %r9d, %xmm1
-; SSE3-NEXT:    movd %r8d, %xmm3
+; SSE3-NEXT:    movzwl -24(%rsp,%r8,2), %r8d
+; SSE3-NEXT:    movd %r8d, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
+; SSE3-NEXT:    movd %edi, %xmm3
 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE3-NEXT:    movd %edi, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
+; SSE3-NEXT:    movd %esi, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
 ; SSE3-NEXT:    movd %edx, %xmm2
 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
 ; SSE3-NEXT:    movd %ecx, %xmm1
+; SSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
 ; SSE3-NEXT:    movd %eax, %xmm4
 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
@@ -1102,9 +1102,8 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices)
 ; SSE3-NEXT:    movq %xmm1, %rcx
 ; SSE3-NEXT:    andl $1, %ecx
 ; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
-; SSE3-NEXT:    movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
-; SSE3-NEXT:    movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
-; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT:    movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
 ; SSE3-NEXT:    pandn %xmm0, %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSE3-NEXT:    retq
@@ -1127,9 +1126,8 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices)
 ; SSSE3-NEXT:    movq %xmm1, %rcx
 ; SSSE3-NEXT:    andl $1, %ecx
 ; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
-; SSSE3-NEXT:    movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
-; SSSE3-NEXT:    movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT:    movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
 ; SSSE3-NEXT:    pandn %xmm0, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
@@ -1302,16 +1300,16 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
 ; SSE3-NEXT:    movd %xmm1, %esi
 ; SSE3-NEXT:    movaps %xmm2, -24(%rsp)
 ; SSE3-NEXT:    andl $3, %eax
-; SSE3-NEXT:    movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
 ; SSE3-NEXT:    andl $3, %ecx
-; SSE3-NEXT:    movd -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
 ; SSE3-NEXT:    andl $3, %edx
-; SSE3-NEXT:    movd -24(%rsp,%rdx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
 ; SSE3-NEXT:    andl $3, %esi
-; SSE3-NEXT:    movd -24(%rsp,%rsi,4), %xmm4 # xmm4 = mem[0],zero,zero,zero
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE3-NEXT:    movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE3-NEXT:    movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT:    movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
@@ -1329,8 +1327,9 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSSE3-NEXT:    por %xmm2, %xmm1
 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    pandn %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: var_shuffle_zero_v4f32:
@@ -1341,8 +1340,9 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
 ; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [67372036,67372036,67372036,67372036]
 ; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pandn %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; XOP-LABEL: var_shuffle_zero_v4f32:
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index 1a6351524ffbd..5af992c2d05dd 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -289,7 +289,8 @@ define <8 x i32> @test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32>
 ; AVX2-NEXT:    cmpq $8, %r11
 ; AVX2-NEXT:    cmovbl (%rsp,%rax,4), %ebx
 ; AVX2-NEXT:    vmovss %xmm0, (%rsp)
-; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
+; AVX2-NEXT:    movl %edx, %eax
+; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
 ; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
 ; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rsi,4)
 ; AVX2-NEXT:    andl $7, %edi
@@ -363,7 +364,8 @@ define <8 x float> @test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x f
 ; AVX2-NEXT:    vmovss %xmm0, (%rsp)
 ; AVX2-NEXT:    vmovd %xmm3, %eax
 ; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rax,4)
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rcx,4)
 ; AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addq %rax, %rcx
@@ -1093,15 +1095,15 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT:    vmovaps %xmm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vpextrb $1, %xmm1, %r13d
+; AVX2-NEXT:    vpextrb $1, %xmm1, %ebp
 ; AVX2-NEXT:    vmovd %xmm1, %esi
 ; AVX2-NEXT:    movl %esi, %eax
 ; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    subb %r13b, %al
+; AVX2-NEXT:    subb %bpl, %al
 ; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
 ; AVX2-NEXT:    subb %dl, %al
-; AVX2-NEXT:    vpextrb $3, %xmm1, %ebp
-; AVX2-NEXT:    subb %bpl, %al
+; AVX2-NEXT:    vpextrb $3, %xmm1, %r13d
+; AVX2-NEXT:    subb %r13b, %al
 ; AVX2-NEXT:    vpextrb $4, %xmm1, %r12d
 ; AVX2-NEXT:    subb %r12b, %al
 ; AVX2-NEXT:    vpextrb $5, %xmm1, %r15d
@@ -1135,17 +1137,17 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
 ; AVX2-NEXT:    vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    andl $1, %esi
 ; AVX2-NEXT:    vpextrb $1, %xmm0, -40(%rsp,%rsi)
-; AVX2-NEXT:    andl $1, %r13d
-; AVX2-NEXT:    addq %rsi, %r13
-; AVX2-NEXT:    vpextrb $2, %xmm0, -40(%rsp,%r13)
+; AVX2-NEXT:    andl $1, %ebp
+; AVX2-NEXT:    addq %rsi, %rbp
+; AVX2-NEXT:    vpextrb $2, %xmm0, -40(%rsp,%rbp)
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %r13, %rdx
+; AVX2-NEXT:    addq %rbp, %rdx
 ; AVX2-NEXT:    vpextrb $3, %xmm0, -40(%rsp,%rdx)
-; AVX2-NEXT:    andl $1, %ebp
-; AVX2-NEXT:    addq %rdx, %rbp
-; AVX2-NEXT:    vpextrb $4, %xmm0, -40(%rsp,%rbp)
+; AVX2-NEXT:    andl $1, %r13d
+; AVX2-NEXT:    addq %rdx, %r13
+; AVX2-NEXT:    vpextrb $4, %xmm0, -40(%rsp,%r13)
 ; AVX2-NEXT:    andl $1, %r12d
-; AVX2-NEXT:    addq %rbp, %r12
+; AVX2-NEXT:    addq %r13, %r12
 ; AVX2-NEXT:    andl $1, %r15d
 ; AVX2-NEXT:    addq %r12, %r15
 ; AVX2-NEXT:    # kill: def $r12d killed $r12d killed $r12 def $r12
@@ -1693,30 +1695,30 @@ define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x
 ; AVX2-NEXT:    vpextrw $4, %xmm1, %r13d
 ; AVX2-NEXT:    andl $1, %r13d
 ; AVX2-NEXT:    addq %r12, %r13
-; AVX2-NEXT:    vpextrw $5, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    addq %r13, %rdx
-; AVX2-NEXT:    vpextrw $6, %xmm1, %ecx
+; AVX2-NEXT:    vpextrw $5, %xmm1, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rdx, %rcx
-; AVX2-NEXT:    vpextrw $7, %xmm1, %edi
-; AVX2-NEXT:    andl $1, %edi
-; AVX2-NEXT:    addq %rcx, %rdi
+; AVX2-NEXT:    addq %r13, %rcx
+; AVX2-NEXT:    vpextrw $6, %xmm1, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %rcx, %rax
+; AVX2-NEXT:    vpextrw $7, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rax, %rdx
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    cmpq $16, %rdi
-; AVX2-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT:    cmovbw (%rsp,%rsi,2), %ax
-; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT:    cmpq $16, %rdx
+; AVX2-NEXT:    vpextrw $7, %xmm1, %esi
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX2-NEXT:    cmovbw (%rsp,%rdi,2), %si
+; AVX2-NEXT:    movl %esi, %edi
 ; AVX2-NEXT:    vpextrw $0, %xmm0, (%rsp)
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
 ; AVX2-NEXT:    vpextrw $1, %xmm0, (%rsp,%rsi,2)
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX2-NEXT:    vpextrw $2, %xmm0, (%rsp,%rsi,2)
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX2-NEXT:    vpextrw $3, %xmm0, (%rsp,%rsi,2)
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT:    vpextrw $4, %xmm0, (%rsp,%rax,2)
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT:    vpextrw $4, %xmm0, (%rsp,%rsi,2)
 ; AVX2-NEXT:    andl $15, %r8d
 ; AVX2-NEXT:    vpextrw $5, %xmm0, (%rsp,%r8,2)
 ; AVX2-NEXT:    andl $15, %r9d
@@ -1735,16 +1737,15 @@ define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x
 ; AVX2-NEXT:    vpextrw $4, %xmm1, (%rsp,%r12,2)
 ; AVX2-NEXT:    andl $15, %r13d
 ; AVX2-NEXT:    vpextrw $5, %xmm1, (%rsp,%r13,2)
-; AVX2-NEXT:    andl $15, %edx
-; AVX2-NEXT:    vpextrw $6, %xmm1, (%rsp,%rdx,2)
 ; AVX2-NEXT:    andl $15, %ecx
-; AVX2-NEXT:    vpextrw $7, %xmm1, (%rsp,%rcx,2)
-; AVX2-NEXT:    cmpq $15, %rdi
+; AVX2-NEXT:    vpextrw $6, %xmm1, (%rsp,%rcx,2)
+; AVX2-NEXT:    andl $15, %eax
+; AVX2-NEXT:    vpextrw $7, %xmm1, (%rsp,%rax,2)
+; AVX2-NEXT:    cmpq $15, %rdx
 ; AVX2-NEXT:    movl $15, %eax
-; AVX2-NEXT:    cmovbq %rdi, %rax
+; AVX2-NEXT:    cmovbq %rdx, %rax
 ; AVX2-NEXT:    movl %eax, %eax
-; AVX2-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; AVX2-NEXT:    movw %cx, (%rsp,%rax,2)
+; AVX2-NEXT:    movw %di, (%rsp,%rax,2)
 ; AVX2-NEXT:    vmovaps (%rsp), %ymm0
 ; AVX2-NEXT:    leaq -40(%rbp), %rsp
 ; AVX2-NEXT:    popq %rbx
@@ -1788,135 +1789,141 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
 ; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
 ; AVX2-NEXT:    andq $-32, %rsp
-; AVX2-NEXT:    subq $96, %rsp
-; AVX2-NEXT:    movl %r9d, %r11d
-; AVX2-NEXT:    movl %r8d, %r10d
-; AVX2-NEXT:    movl %ecx, %r9d
-; AVX2-NEXT:    movl %edx, %r8d
+; AVX2-NEXT:    subq $160, %rsp
+; AVX2-NEXT:    # kill: def $r9d killed $r9d def $r9
+; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    # kill: def $r8d killed $r8d def $r8
+; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
+; AVX2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT:    movzbl 360(%rbp), %eax
-; AVX2-NEXT:    movzbl 352(%rbp), %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm4
+; AVX2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movl 360(%rbp), %eax
+; AVX2-NEXT:    movl 352(%rbp), %r10d
+; AVX2-NEXT:    vmovd %r10d, %xmm4
 ; AVX2-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 368(%rbp), %eax
+; AVX2-NEXT:    movl 368(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 376(%rbp), %eax
+; AVX2-NEXT:    movl 376(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 384(%rbp), %eax
+; AVX2-NEXT:    movl 384(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 392(%rbp), %eax
+; AVX2-NEXT:    movl 392(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 400(%rbp), %eax
+; AVX2-NEXT:    movl 400(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 408(%rbp), %eax
+; AVX2-NEXT:    movl 408(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 416(%rbp), %eax
+; AVX2-NEXT:    movl 416(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 424(%rbp), %eax
+; AVX2-NEXT:    movl 424(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 432(%rbp), %eax
+; AVX2-NEXT:    movl 432(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 440(%rbp), %eax
+; AVX2-NEXT:    movl 440(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 448(%rbp), %eax
+; AVX2-NEXT:    movl 448(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 456(%rbp), %eax
+; AVX2-NEXT:    movl 456(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 464(%rbp), %eax
+; AVX2-NEXT:    movl 464(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 472(%rbp), %eax
+; AVX2-NEXT:    movl 472(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movzbl 224(%rbp), %eax
+; AVX2-NEXT:    movl 224(%rbp), %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm5
-; AVX2-NEXT:    movzbl 232(%rbp), %eax
+; AVX2-NEXT:    movl 232(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 240(%rbp), %eax
+; AVX2-NEXT:    movl 240(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 248(%rbp), %eax
+; AVX2-NEXT:    movl 248(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 256(%rbp), %eax
+; AVX2-NEXT:    movl 256(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 264(%rbp), %eax
+; AVX2-NEXT:    movl 264(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 272(%rbp), %eax
+; AVX2-NEXT:    movl 272(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 280(%rbp), %eax
+; AVX2-NEXT:    movl 280(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 288(%rbp), %eax
+; AVX2-NEXT:    movl 288(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 296(%rbp), %eax
+; AVX2-NEXT:    movl 296(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 304(%rbp), %eax
+; AVX2-NEXT:    movl 304(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 312(%rbp), %eax
+; AVX2-NEXT:    movl 312(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 320(%rbp), %eax
+; AVX2-NEXT:    movl 320(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 328(%rbp), %eax
+; AVX2-NEXT:    movl 328(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 336(%rbp), %eax
+; AVX2-NEXT:    movl 336(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 344(%rbp), %eax
+; AVX2-NEXT:    movl 344(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
 ; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX2-NEXT:    vmovd %edi, %xmm5
 ; AVX2-NEXT:    vpinsrb $1, %esi, %xmm5, %xmm5
 ; AVX2-NEXT:    vpinsrb $2, %edx, %xmm5, %xmm5
-; AVX2-NEXT:    vpinsrb $3, %r9d, %xmm5, %xmm5
-; AVX2-NEXT:    vpinsrb $4, %r10d, %xmm5, %xmm5
-; AVX2-NEXT:    vpinsrb $5, %r11d, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 16(%rbp), %ebx
-; AVX2-NEXT:    vpinsrb $6, %ebx, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 24(%rbp), %r14d
-; AVX2-NEXT:    vpinsrb $7, %r14d, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 32(%rbp), %r15d
-; AVX2-NEXT:    vpinsrb $8, %r15d, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 40(%rbp), %r12d
-; AVX2-NEXT:    vpinsrb $9, %r12d, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 48(%rbp), %r13d
-; AVX2-NEXT:    vpinsrb $10, %r13d, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 56(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 64(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 72(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 80(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 88(%rbp), %eax
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movzbl 96(%rbp), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm6
-; AVX2-NEXT:    movzbl 104(%rbp), %eax
+; AVX2-NEXT:    vpinsrb $3, %ecx, %xmm5, %xmm5
+; AVX2-NEXT:    vpinsrb $4, %r8d, %xmm5, %xmm5
+; AVX2-NEXT:    vpinsrb $5, %r9d, %xmm5, %xmm5
+; AVX2-NEXT:    movl 16(%rbp), %esi
+; AVX2-NEXT:    vpinsrb $6, %esi, %xmm5, %xmm5
+; AVX2-NEXT:    movl 24(%rbp), %edi
+; AVX2-NEXT:    vpinsrb $7, %edi, %xmm5, %xmm5
+; AVX2-NEXT:    movl 32(%rbp), %r8d
+; AVX2-NEXT:    vpinsrb $8, %r8d, %xmm5, %xmm5
+; AVX2-NEXT:    movl 40(%rbp), %r9d
+; AVX2-NEXT:    vpinsrb $9, %r9d, %xmm5, %xmm5
+; AVX2-NEXT:    movl 48(%rbp), %r10d
+; AVX2-NEXT:    vpinsrb $10, %r10d, %xmm5, %xmm5
+; AVX2-NEXT:    movl 56(%rbp), %r11d
+; AVX2-NEXT:    vpinsrb $11, %r11d, %xmm5, %xmm5
+; AVX2-NEXT:    movl 64(%rbp), %ebx
+; AVX2-NEXT:    vpinsrb $12, %ebx, %xmm5, %xmm5
+; AVX2-NEXT:    movl 72(%rbp), %r14d
+; AVX2-NEXT:    vpinsrb $13, %r14d, %xmm5, %xmm5
+; AVX2-NEXT:    movl 80(%rbp), %r15d
+; AVX2-NEXT:    vpinsrb $14, %r15d, %xmm5, %xmm5
+; AVX2-NEXT:    movl 88(%rbp), %r12d
+; AVX2-NEXT:    vpinsrb $15, %r12d, %xmm5, %xmm5
+; AVX2-NEXT:    movl 96(%rbp), %r13d
+; AVX2-NEXT:    vmovd %r13d, %xmm6
+; AVX2-NEXT:    movl 104(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $1, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 112(%rbp), %eax
+; AVX2-NEXT:    movl 112(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $2, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 120(%rbp), %eax
+; AVX2-NEXT:    movl 120(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $3, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 128(%rbp), %eax
+; AVX2-NEXT:    movl 128(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $4, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 136(%rbp), %eax
+; AVX2-NEXT:    movl 136(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $5, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 144(%rbp), %eax
+; AVX2-NEXT:    movl 144(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $6, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 152(%rbp), %eax
+; AVX2-NEXT:    movl 152(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $7, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 160(%rbp), %eax
+; AVX2-NEXT:    movl 160(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $8, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 168(%rbp), %eax
+; AVX2-NEXT:    movl 168(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $9, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 176(%rbp), %eax
+; AVX2-NEXT:    movl 176(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $10, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 184(%rbp), %eax
+; AVX2-NEXT:    movl 184(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $11, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 192(%rbp), %eax
+; AVX2-NEXT:    movl 192(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $12, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 200(%rbp), %eax
+; AVX2-NEXT:    movl 200(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $13, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 208(%rbp), %eax
+; AVX2-NEXT:    movl 208(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm6, %xmm6
-; AVX2-NEXT:    movzbl 216(%rbp), %eax
+; AVX2-NEXT:    movl 216(%rbp), %eax
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm6, %xmm6
 ; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
 ; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
@@ -1960,435 +1967,382 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
 ; AVX2-NEXT:    vmovaps %ymm2, (%rsp)
 ; AVX2-NEXT:    movzbl %al, %eax
 ; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    movzbl (%rsp,%rax), %edx
+; AVX2-NEXT:    movzbl (%rsp,%rax), %eax
+; AVX2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp)
-; AVX2-NEXT:    andl $1, %edi
-; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rdi)
-; AVX2-NEXT:    andl $1, %esi
-; AVX2-NEXT:    addq %rdi, %rsi
-; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rsi)
-; AVX2-NEXT:    andl $1, %r8d
-; AVX2-NEXT:    addq %rsi, %r8
-; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%r8)
-; AVX2-NEXT:    andl $1, %r9d
-; AVX2-NEXT:    addq %r8, %r9
-; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%r9)
-; AVX2-NEXT:    andl $1, %r10d
-; AVX2-NEXT:    addq %r9, %r10
-; AVX2-NEXT:    movl %r10d, %eax
-; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    andl $1, %r11d
-; AVX2-NEXT:    addq %r10, %r11
-; AVX2-NEXT:    movzbl %bl, %eax
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %r11, %rax
-; AVX2-NEXT:    # kill: def $r11d killed $r11d killed $r11 def $r11
-; AVX2-NEXT:    andl $63, %r11d
-; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%r11)
-; AVX2-NEXT:    movzbl %r14b, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl %r15b, %eax
+; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX2-NEXT:    andl $1, %eax
 ; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl %r12b, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl %r13b, %eax
+; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX2-NEXT:    andl $1, %eax
 ; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 56(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 64(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX2-NEXT:    andl $1, %eax
 ; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 72(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 80(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX2-NEXT:    andl $1, %eax
 ; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 88(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    addq %rax, %rsi
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 96(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    andl $1, %edi
+; AVX2-NEXT:    addq %rsi, %rdi
+; AVX2-NEXT:    # kill: def $esi killed $esi killed $rsi def $rsi
+; AVX2-NEXT:    andl $63, %esi
+; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rsi)
+; AVX2-NEXT:    andl $1, %r8d
+; AVX2-NEXT:    addq %rdi, %r8
+; AVX2-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX2-NEXT:    andl $63, %edi
+; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rdi)
+; AVX2-NEXT:    andl $1, %r9d
+; AVX2-NEXT:    addq %r8, %r9
+; AVX2-NEXT:    # kill: def $r8d killed $r8d killed $r8 def $r8
+; AVX2-NEXT:    andl $63, %r8d
+; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%r8)
+; AVX2-NEXT:    andl $1, %r10d
+; AVX2-NEXT:    addq %r9, %r10
+; AVX2-NEXT:    # kill: def $r9d killed $r9d killed $r9 def $r9
+; AVX2-NEXT:    andl $63, %r9d
+; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%r9)
+; AVX2-NEXT:    andl $1, %r11d
+; AVX2-NEXT:    addq %r10, %r11
+; AVX2-NEXT:    # kill: def $r10d killed $r10d killed $r10 def $r10
+; AVX2-NEXT:    andl $63, %r10d
+; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%r10)
+; AVX2-NEXT:    andl $1, %ebx
+; AVX2-NEXT:    addq %r11, %rbx
+; AVX2-NEXT:    # kill: def $r11d killed $r11d killed $r11 def $r11
+; AVX2-NEXT:    andl $63, %r11d
+; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%r11)
+; AVX2-NEXT:    andl $1, %r14d
+; AVX2-NEXT:    addq %rbx, %r14
+; AVX2-NEXT:    # kill: def $ebx killed $ebx killed $rbx def $rbx
+; AVX2-NEXT:    andl $63, %ebx
+; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rbx)
+; AVX2-NEXT:    andl $1, %r15d
+; AVX2-NEXT:    addq %r14, %r15
+; AVX2-NEXT:    # kill: def $r14d killed $r14d killed $r14 def $r14
+; AVX2-NEXT:    andl $63, %r14d
+; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%r14)
+; AVX2-NEXT:    andl $1, %r12d
+; AVX2-NEXT:    addq %r15, %r12
+; AVX2-NEXT:    # kill: def $r15d killed $r15d killed $r15 def $r15
+; AVX2-NEXT:    andl $63, %r15d
+; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%r15)
+; AVX2-NEXT:    andl $1, %r13d
+; AVX2-NEXT:    addq %r12, %r13
+; AVX2-NEXT:    # kill: def $r12d killed $r12d killed $r12 def $r12
+; AVX2-NEXT:    andl $63, %r12d
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 104(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%r12)
+; AVX2-NEXT:    movl 104(%rbp), %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    addq %r13, %rax
+; AVX2-NEXT:    # kill: def $r13d killed $r13d killed $r13 def $r13
+; AVX2-NEXT:    andl $63, %r13d
+; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%r13)
+; AVX2-NEXT:    movl 112(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rax, %rdx
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
 ; AVX2-NEXT:    andl $63, %eax
-; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 112(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 120(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 120(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 128(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 136(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 128(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 136(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 144(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 152(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 144(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 152(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 160(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 168(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 160(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 168(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 176(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 184(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 176(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 184(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 192(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 200(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 192(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 200(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 208(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 216(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 208(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 216(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 224(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $0, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 232(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 224(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $0, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 232(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $1, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 240(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 248(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 240(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $2, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 248(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $3, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 256(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 264(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    movl 256(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $4, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 264(%rbp), %ecx
+; AVX2-NEXT:    andl $1, %ecx
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $5, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 272(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 280(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 272(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $6, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 280(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $7, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 288(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $8, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 296(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 288(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $8, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 296(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $9, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 304(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 312(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 304(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $10, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 312(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $11, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 320(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $12, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 328(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 320(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $12, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 328(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $13, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 336(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 344(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 336(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $14, %xmm1, (%rsp,%rax)
+; AVX2-NEXT:    movl 344(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $15, %xmm1, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 352(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    movl 352(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 360(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    vpextrb $0, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 360(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 368(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 376(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 368(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $2, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 376(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 384(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 392(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 384(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $4, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 392(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 400(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 408(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 400(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 408(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 416(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 424(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 416(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $8, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 424(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 432(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 440(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 432(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $10, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 440(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 448(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 456(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 448(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $12, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 456(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT:    movzbl 464(%rbp), %eax
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    addq %rcx, %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT:    movzbl 472(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 464(%rbp), %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    addq %rcx, %rdx
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    andl $63, %eax
+; AVX2-NEXT:    vpextrb $14, %xmm0, (%rsp,%rax)
+; AVX2-NEXT:    movl 472(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    addq %rax, %rcx
-; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT:    addq %rdx, %rcx
+; AVX2-NEXT:    movl %edx, %eax
 ; AVX2-NEXT:    andl $63, %eax
 ; AVX2-NEXT:    vpextrb $15, %xmm0, (%rsp,%rax)
 ; AVX2-NEXT:    vpextrb $15, %xmm0, %eax
 ; AVX2-NEXT:    cmpq $64, %rcx
-; AVX2-NEXT:    cmovbl %edx, %eax
+; AVX2-NEXT:    cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
 ; AVX2-NEXT:    cmpq $63, %rcx
-; AVX2-NEXT:    movl $63, %edx
-; AVX2-NEXT:    cmovbq %rcx, %rdx
-; AVX2-NEXT:    movb %al, (%rsp,%rdx)
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    movl $63, %ecx
+; AVX2-NEXT:    cmovbq %rdx, %rcx
+; AVX2-NEXT:    movb %al, (%rsp,%rcx)
 ; AVX2-NEXT:    vmovaps (%rsp), %ymm0
 ; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
 ; AVX2-NEXT:    leaq -40(%rbp), %rsp
@@ -3323,10 +3277,10 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
 ; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    andq $-32, %rsp
 ; AVX2-NEXT:    subq $288, %rsp # imm = 0x120
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
 ; AVX2-NEXT:    # kill: def $r9d killed $r9d def $r9
 ; AVX2-NEXT:    # kill: def $r8d killed $r8d def $r8
+; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
 ; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; AVX2-NEXT:    movq %rdi, %rax
 ; AVX2-NEXT:    vmovss %xmm0, (%rsp)
@@ -3344,413 +3298,355 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
 ; AVX2-NEXT:    vmovss %xmm0, (%rsp,%r8,4)
 ; AVX2-NEXT:    andl $1, %r9d
 ; AVX2-NEXT:    addl %r8d, %r9d
-; AVX2-NEXT:    movzbl 16(%rbp), %ecx
 ; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%r9,4)
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 16(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %r9d, %ecx
-; AVX2-NEXT:    movzbl 24(%rbp), %edx
 ; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 24(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    movzbl 32(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 32(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 40(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 40(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vmovss %xmm1, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 48(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 48(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm1, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 56(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 56(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm1, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 64(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 64(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm1, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 72(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 72(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 80(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 80(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 88(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 88(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 96(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 96(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 104(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 104(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vmovss %xmm2, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 112(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 112(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm2, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 120(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 120(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm2, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 128(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 128(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm2, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 136(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 136(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm0
 ; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 144(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 144(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 152(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 152(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 160(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 160(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 168(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 168(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vmovss %xmm3, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 176(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 176(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm3, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 184(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 184(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm3, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 192(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 192(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm3, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 200(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 200(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm0
 ; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 208(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 208(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 216(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 216(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 224(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 224(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 232(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 232(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vmovss %xmm4, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 240(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 240(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm4, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 248(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 248(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm4, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 256(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 256(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm4, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 264(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 264(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractf128 $1, %ymm4, %xmm0
 ; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 272(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 272(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 280(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 280(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 288(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 288(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 296(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 296(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vmovss %xmm5, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 304(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 304(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm5, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 312(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 312(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm5, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 320(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 320(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm5, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 328(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 328(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractf128 $1, %ymm5, %xmm0
 ; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 336(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 336(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 344(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 344(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 352(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 352(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 360(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 360(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vmovss %xmm6, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 368(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 368(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm6, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 376(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 376(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm6, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 384(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 384(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm6, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 392(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 392(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractf128 $1, %ymm6, %xmm0
 ; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 400(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 400(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 408(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 408(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 416(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 416(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 424(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 424(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vmovss %xmm7, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 432(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 432(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm7, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 440(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 440(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm7, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 448(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 448(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $3, %xmm7, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 456(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 456(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractf128 $1, %ymm7, %xmm0
 ; AVX2-NEXT:    vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT:    movzbl 464(%rbp), %ecx
-; AVX2-NEXT:    movzbl %cl, %ecx
+; AVX2-NEXT:    movl 464(%rbp), %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addl %edx, %ecx
-; AVX2-NEXT:    # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT:    # kill: def $edx killed $edx killed $rdx def $rdx
 ; AVX2-NEXT:    andl $63, %edx
 ; AVX2-NEXT:    vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT:    movzbl 472(%rbp), %edx
-; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    movl 472(%rbp), %edx
 ; AVX2-NEXT:    andl $1, %edx
 ; AVX2-NEXT:    addl %ecx, %edx
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX2-NEXT:    andl $63, %ecx
 ; AVX2-NEXT:    vextractps $2, %xmm0, (%rsp,%rcx,4)
 ; AVX2-NEXT:    andl $63, %edx
@@ -4748,6 +4644,17 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) nounwind {
 ; AVX2-NEXT:    vpextrb $3, %xmm1, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    addq %rax, %rcx
+; AVX2-NEXT:    vpextrb $4, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT:    vpextrb $5, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT:    vpextrb $6, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT:    vpextrb $7, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT:    vpextrb $8, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT:    vpextrb $9, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT:    vpextrb $10, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT:    vpextrb $11, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT:    vpextrb $12, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT:    vpextrb $13, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT:    vpextrb $14, %xmm0, -24(%rsp,%rcx)
 ; AVX2-NEXT:    vpextrb $15, %xmm0, -24(%rsp,%rcx)
 ; AVX2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
 ; AVX2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index fce622a99bb6a..560d5be284f15 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -12,6 +12,9 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movzbl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    movd %eax, %xmm0
+; X64-NO-BMI2-NEXT:    movd %xmm0, %eax
+; X64-NO-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrl %cl, %eax
@@ -21,6 +24,9 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzbl (%rdi), %eax
+; X64-BMI2-NEXT:    movd %eax, %xmm0
+; X64-BMI2-NEXT:    movd %xmm0, %eax
+; X64-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    movb %al, (%rdx)
@@ -28,14 +34,17 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2:       # %bb.0:
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NEXT:    movzbl (%edx), %edx
+; X86-NO-BMI2-NEXT:    movd %edx, %xmm0
+; X86-NO-BMI2-NEXT:    movd %xmm0, %edx
+; X86-NO-BMI2-NEXT:    movzwl %dx, %edx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NEXT:    movb %al, (%edx)
+; X86-NO-BMI2-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NEXT:    movb %dl, (%eax)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
@@ -44,6 +53,9 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzbl (%edx), %edx
+; X86-BMI2-NEXT:    movd %edx, %xmm0
+; X86-BMI2-NEXT:    movd %xmm0, %edx
+; X86-BMI2-NEXT:    movzwl %dx, %edx
 ; X86-BMI2-NEXT:    shll $3, %ecx
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
 ; X86-BMI2-NEXT:    movb %cl, (%eax)
@@ -65,6 +77,10 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    movd %eax, %xmm0
+; X64-NO-BMI2-NEXT:    pxor %xmm1, %xmm1
+; X64-NO-BMI2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NO-BMI2-NEXT:    movd %xmm0, %eax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrl %cl, %eax
@@ -74,6 +90,10 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzwl (%rdi), %eax
+; X64-BMI2-NEXT:    movd %eax, %xmm0
+; X64-BMI2-NEXT:    pxor %xmm1, %xmm1
+; X64-BMI2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-BMI2-NEXT:    movd %xmm0, %eax
 ; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    movb %al, (%rdx)
@@ -81,14 +101,18 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2:       # %bb.0:
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT:    movzwl (%eax), %eax
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NEXT:    movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT:    movd %edx, %xmm0
+; X86-NO-BMI2-NEXT:    pxor %xmm1, %xmm1
+; X86-NO-BMI2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NO-BMI2-NEXT:    movd %xmm0, %edx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NEXT:    movb %al, (%edx)
+; X86-NO-BMI2-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NEXT:    movb %dl, (%eax)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -97,6 +121,10 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzwl (%edx), %edx
+; X86-BMI2-NEXT:    movd %edx, %xmm0
+; X86-BMI2-NEXT:    pxor %xmm1, %xmm1
+; X86-BMI2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-BMI2-NEXT:    movd %xmm0, %edx
 ; X86-BMI2-NEXT:    shll $3, %ecx
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
 ; X86-BMI2-NEXT:    movb %cl, (%eax)
@@ -119,6 +147,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    movd %eax, %xmm0
+; X64-NO-BMI2-NEXT:    pxor %xmm1, %xmm1
+; X64-NO-BMI2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NO-BMI2-NEXT:    movd %xmm0, %eax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrl %cl, %eax
@@ -128,6 +160,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzwl (%rdi), %eax
+; X64-BMI2-NEXT:    movd %eax, %xmm0
+; X64-BMI2-NEXT:    pxor %xmm1, %xmm1
+; X64-BMI2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-BMI2-NEXT:    movd %xmm0, %eax
 ; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    movw %ax, (%rdx)
@@ -139,6 +175,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NEXT:    movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT:    movd %edx, %xmm0
+; X86-NO-BMI2-NEXT:    pxor %xmm1, %xmm1
+; X86-NO-BMI2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NO-BMI2-NEXT:    movd %xmm0, %edx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NEXT:    shrl %cl, %edx
@@ -151,6 +191,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzwl (%edx), %edx
+; X86-BMI2-NEXT:    movd %edx, %xmm0
+; X86-BMI2-NEXT:    pxor %xmm1, %xmm1
+; X86-BMI2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-BMI2-NEXT:    movd %xmm0, %edx
 ; X86-BMI2-NEXT:    shll $3, %ecx
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
 ; X86-BMI2-NEXT:    movw %cx, (%eax)
@@ -171,8 +215,9 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
@@ -180,8 +225,9 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movl (%rdi), %eax
+; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movq %xmm0, %rax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movb %al, (%rdx)
 ; X64-BMI2-NEXT:    retq
@@ -248,8 +294,9 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
@@ -257,8 +304,9 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movl (%rdi), %eax
+; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movq %xmm0, %rax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movw %ax, (%rdx)
 ; X64-BMI2-NEXT:    retq
@@ -324,8 +372,9 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
@@ -333,8 +382,9 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movl (%rdi), %eax
+; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movq %xmm0, %rax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movl %eax, (%rdx)
 ; X64-BMI2-NEXT:    retq
@@ -400,38 +450,73 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (%rax,%rax), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb %sil, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-SHLD:       # %bb.0:
-; X64-SHLD-NEXT:    movq %rsi, %rcx
-; X64-SHLD-NEXT:    movq (%rdi), %rax
-; X64-SHLD-NEXT:    shll $3, %ecx
-; X64-SHLD-NEXT:    xorl %esi, %esi
-; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-SHLD-NEXT:    testb $64, %cl
-; X64-SHLD-NEXT:    cmovneq %rsi, %rax
-; X64-SHLD-NEXT:    movb %al, (%rdx)
-; X64-SHLD-NEXT:    retq
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (%rcx,%rcx), %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
@@ -439,12 +524,11 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
@@ -469,12 +553,11 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %edx
 ; X86-SHLD-NEXT:    shrb $3, %dl
 ; X86-SHLD-NEXT:    andb $12, %dl
@@ -495,12 +578,11 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
@@ -532,38 +614,73 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (%rax,%rax), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-SHLD:       # %bb.0:
-; X64-SHLD-NEXT:    movq %rsi, %rcx
-; X64-SHLD-NEXT:    movq (%rdi), %rax
-; X64-SHLD-NEXT:    shll $3, %ecx
-; X64-SHLD-NEXT:    xorl %esi, %esi
-; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-SHLD-NEXT:    testb $64, %cl
-; X64-SHLD-NEXT:    cmovneq %rsi, %rax
-; X64-SHLD-NEXT:    movw %ax, (%rdx)
-; X64-SHLD-NEXT:    retq
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (%rcx,%rcx), %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
@@ -571,12 +688,11 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
@@ -601,12 +717,11 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %edx
 ; X86-SHLD-NEXT:    shrb $3, %dl
 ; X86-SHLD-NEXT:    andb $12, %dl
@@ -627,12 +742,11 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
@@ -663,38 +777,73 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (%rax,%rax), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-SHLD:       # %bb.0:
-; X64-SHLD-NEXT:    movq %rsi, %rcx
-; X64-SHLD-NEXT:    movq (%rdi), %rax
-; X64-SHLD-NEXT:    shll $3, %ecx
-; X64-SHLD-NEXT:    xorl %esi, %esi
-; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-SHLD-NEXT:    testb $64, %cl
-; X64-SHLD-NEXT:    cmovneq %rsi, %rax
-; X64-SHLD-NEXT:    movl %eax, (%rdx)
-; X64-SHLD-NEXT:    retq
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (%rcx,%rcx), %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
@@ -702,12 +851,11 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
@@ -732,12 +880,11 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %edx
 ; X86-SHLD-NEXT:    shrb $3, %dl
 ; X86-SHLD-NEXT:    andb $12, %dl
@@ -758,12 +905,11 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
@@ -794,38 +940,73 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-SHLD:       # %bb.0:
-; X64-SHLD-NEXT:    movq %rsi, %rcx
-; X64-SHLD-NEXT:    movq (%rdi), %rax
-; X64-SHLD-NEXT:    shll $3, %ecx
-; X64-SHLD-NEXT:    xorl %esi, %esi
-; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-SHLD-NEXT:    testb $64, %cl
-; X64-SHLD-NEXT:    cmovneq %rsi, %rax
-; X64-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-SHLD-NEXT:    retq
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
@@ -836,12 +1017,11 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
@@ -881,12 +1061,11 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %edx
 ; X86-SHLD-NEXT:    shrb $3, %dl
 ; X86-SHLD-NEXT:    andb $12, %dl
@@ -916,12 +1095,11 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
@@ -964,13 +1142,13 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT:    xorps %xmm0, %xmm0
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-NEXT:    movzbl %al, %eax
@@ -982,13 +1160,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-BMI2-NEXT:    xorps %xmm0, %xmm0
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movups (%rdi), %xmm1
 ; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movl %esi, %eax
 ; X64-BMI2-NEXT:    shrb $6, %al
 ; X64-BMI2-NEXT:    movzbl %al, %eax
@@ -1003,13 +1181,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
@@ -1033,13 +1211,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %edx
 ; X86-SHLD-NEXT:    shrb $5, %dl
 ; X86-SHLD-NEXT:    movzbl %dl, %edx
@@ -1059,13 +1237,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
@@ -1096,13 +1274,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT:    xorps %xmm0, %xmm0
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-NEXT:    movzbl %al, %eax
@@ -1120,13 +1298,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-BMI2-NEXT:    xorps %xmm0, %xmm0
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movups (%rdi), %xmm1
 ; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movl %esi, %eax
 ; X64-BMI2-NEXT:    shrb $6, %al
 ; X64-BMI2-NEXT:    movzbl %al, %eax
@@ -1148,13 +1326,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
@@ -1178,13 +1356,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %edx
 ; X86-SHLD-NEXT:    shrb $5, %dl
 ; X86-SHLD-NEXT:    movzbl %dl, %edx
@@ -1204,13 +1382,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
@@ -1240,13 +1418,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT:    xorps %xmm0, %xmm0
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-NEXT:    movzbl %al, %eax
@@ -1264,13 +1442,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-BMI2-NEXT:    xorps %xmm1, %xmm1
+; X64-BMI2-NEXT:    xorps %xmm0, %xmm0
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movups (%rdi), %xmm1
 ; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movl %esi, %eax
 ; X64-BMI2-NEXT:    shrb $6, %al
 ; X64-BMI2-NEXT:    movzbl %al, %eax
@@ -1292,13 +1470,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
@@ -1322,13 +1500,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %edx
 ; X86-SHLD-NEXT:    shrb $5, %dl
 ; X86-SHLD-NEXT:    movzbl %dl, %edx
@@ -1348,13 +1526,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
@@ -1384,13 +1562,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %al
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
@@ -1407,13 +1585,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-SHLD:       # %bb.0:
-; X64-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
-; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movups (%rdi), %xmm1
 ; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-SHLD-NEXT:    movl %ecx, %eax
 ; X64-SHLD-NEXT:    shrb $6, %al
 ; X64-SHLD-NEXT:    movzbl %al, %eax
@@ -1426,13 +1604,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
@@ -1455,13 +1633,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
@@ -1500,13 +1678,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %edx
 ; X86-SHLD-NEXT:    shrb $5, %dl
 ; X86-SHLD-NEXT:    movzbl %dl, %edx
@@ -1535,13 +1713,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
@@ -1583,13 +1761,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm1
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
@@ -1616,13 +1794,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %cl
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %esi
@@ -1644,13 +1822,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm1
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
@@ -1673,15 +1851,15 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm1
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
@@ -1707,13 +1885,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
@@ -1773,13 +1951,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-SHLD-NEXT:    subl $92, %esp
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movups (%eax), %xmm0
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm1
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %eax
 ; X86-SHLD-NEXT:    shrb $5, %al
 ; X86-SHLD-NEXT:    movzbl %al, %ebx
@@ -1816,13 +1994,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
@@ -1881,17 +2059,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    pushq %rax
-; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %esi
@@ -1910,17 +2088,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    pushq %rax
-; X64-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    xorps %xmm0, %xmm0
+; X64-BMI2-NEXT:    movups (%rdi), %xmm1
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm2
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-BMI2-NEXT:    andl $56, %eax
 ; X64-BMI2-NEXT:    andl $56, %esi
@@ -1942,17 +2120,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
@@ -1975,17 +2153,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm2
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
 ; X86-SHLD-NEXT:    andl $60, %edx
 ; X86-SHLD-NEXT:    movl (%esp,%edx), %ebx
@@ -2004,17 +2182,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
@@ -2045,17 +2223,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    pushq %rax
-; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %esi
@@ -2074,17 +2252,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    pushq %rax
-; X64-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    xorps %xmm0, %xmm0
+; X64-BMI2-NEXT:    movups (%rdi), %xmm1
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm2
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-BMI2-NEXT:    andl $56, %eax
 ; X64-BMI2-NEXT:    andl $56, %esi
@@ -2106,17 +2284,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
@@ -2139,17 +2317,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm2
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
 ; X86-SHLD-NEXT:    andl $60, %edx
 ; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
@@ -2168,17 +2346,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
@@ -2208,17 +2386,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    pushq %rax
-; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %ecx
 ; X64-NO-BMI2-NEXT:    andl $56, %esi
@@ -2237,17 +2415,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    pushq %rax
-; X64-BMI2-NEXT:    movups (%rdi), %xmm0
-; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    xorps %xmm0, %xmm0
+; X64-BMI2-NEXT:    movups (%rdi), %xmm1
+; X64-BMI2-NEXT:    movups 16(%rdi), %xmm2
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-BMI2-NEXT:    andl $56, %eax
 ; X64-BMI2-NEXT:    andl $56, %esi
@@ -2269,17 +2447,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
@@ -2302,17 +2480,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm2
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
 ; X86-SHLD-NEXT:    andl $60, %edx
 ; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
@@ -2331,17 +2509,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
@@ -2371,17 +2549,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %rax
@@ -2399,17 +2577,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-SHLD:       # %bb.0:
 ; X64-SHLD-NEXT:    pushq %rax
-; X64-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-SHLD-NEXT:    movups 16(%rdi), %xmm2
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-SHLD-NEXT:    andl $56, %esi
 ; X64-SHLD-NEXT:    movq -128(%rsp,%rsi), %rax
@@ -2423,17 +2601,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
@@ -2455,17 +2633,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %esi
@@ -2506,17 +2684,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-SHLD-NEXT:    movups 16(%edx), %xmm2
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %esi
 ; X86-SHLD-NEXT:    andl $60, %esi
 ; X86-SHLD-NEXT:    movl 8(%esp,%esi), %edi
@@ -2545,17 +2723,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
@@ -2595,17 +2773,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
@@ -2634,17 +2812,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %edi
@@ -2670,17 +2848,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
@@ -2706,17 +2884,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
@@ -2745,17 +2923,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $156, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %ebx
@@ -2816,17 +2994,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-SHLD-NEXT:    subl $156, %esp
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movups (%eax), %xmm0
-; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movups (%eax), %xmm1
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm2
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %edi
 ; X86-SHLD-NEXT:    andl $60, %edi
 ; X86-SHLD-NEXT:    movl 24(%esp,%edi), %esi
@@ -2864,17 +3042,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
@@ -2931,17 +3109,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
@@ -2993,17 +3171,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %edi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
@@ -3046,17 +3224,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
@@ -3097,17 +3275,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %ecx
@@ -3146,17 +3324,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $172, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebx
@@ -3257,17 +3435,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-SHLD-NEXT:    subl $156, %esp
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movups (%eax), %xmm0
-; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movups (%eax), %xmm1
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm2
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %edi
 ; X86-SHLD-NEXT:    andl $60, %edi
 ; X86-SHLD-NEXT:    movl 24(%esp,%edi), %edx
@@ -3324,17 +3502,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 8d36eef952a2b..c4c87086dc359 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -168,8 +168,8 @@ define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
@@ -188,17 +188,15 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
@@ -215,13 +213,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
@@ -236,14 +232,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
@@ -260,23 +253,19 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %dl, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -292,8 +281,8 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
@@ -312,17 +301,15 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
@@ -339,18 +326,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movw %dx, (%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
@@ -360,14 +345,11 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
@@ -386,18 +368,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %dx, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
@@ -413,8 +393,8 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
@@ -433,17 +413,15 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
@@ -460,18 +438,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
@@ -481,14 +457,11 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
@@ -507,18 +480,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
@@ -536,8 +507,8 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
@@ -557,8 +528,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
@@ -571,8 +542,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
@@ -591,8 +562,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
@@ -610,8 +581,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -639,8 +610,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -664,8 +635,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -698,8 +669,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
@@ -719,8 +690,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
@@ -733,8 +704,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
@@ -753,8 +724,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
@@ -772,8 +743,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -801,8 +772,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -826,8 +797,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -859,8 +830,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
@@ -880,8 +851,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
@@ -894,8 +865,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
@@ -914,8 +885,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
@@ -933,8 +904,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -962,8 +933,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -987,8 +958,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -1020,8 +991,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
@@ -1041,8 +1012,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
@@ -1055,8 +1026,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
@@ -1075,8 +1046,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
@@ -1097,8 +1068,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -1141,8 +1112,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -1175,8 +1146,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
@@ -1222,9 +1193,9 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1241,9 +1212,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
 ; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
 ; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1263,9 +1234,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1294,9 +1265,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1321,9 +1292,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1357,9 +1328,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1382,9 +1353,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
 ; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
 ; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1411,9 +1382,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1442,9 +1413,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1469,9 +1440,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1504,9 +1475,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1529,9 +1500,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
 ; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
 ; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1558,9 +1529,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1589,9 +1560,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1616,9 +1587,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1651,9 +1622,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1675,9 +1646,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1695,9 +1666,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1725,9 +1696,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1771,9 +1742,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1807,9 +1778,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1854,9 +1825,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1888,9 +1859,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1917,9 +1888,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1947,9 +1918,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1982,9 +1953,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2049,9 +2020,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X86-SHLD-NEXT:    subl $92, %esp
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2093,9 +2064,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)


