[llvm] [DAGCombiner][X86] Push bitcast/ext through freeze for loads (PR #163070)
Guy David via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 26 09:40:21 PDT 2025
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/163070
From 9373e0b0bb04ecdb9d661f26fb8b14597fc87842 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Tue, 14 Oct 2025 21:10:44 +0300
Subject: [PATCH 1/2] [DAGCombiner][X86] Push bitcast/ext through freeze for
loads
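
Pushing the bitcast/extend past the freeze lets the conversion fold into the
load during selection. A minimal IR sketch of the intended effect (illustrative
only; the function name is invented here, and the real coverage is in the new
llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll test):

  define i64 @sketch_zext_freeze_load(ptr %p) {
    %v = load i32, ptr %p
    %f = freeze i32 %v          ; freeze currently sits between load and zext
    %e = zext i32 %f to i64     ; combined into freeze(zext(load i32, ptr %p))
    ret i64 %e                  ; AArch64 then selects a single "ldr w0, [x0]"
  }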
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 17 +++
llvm/lib/Target/X86/X86ISelLowering.cpp | 14 +++
.../AArch64/freeze-bitcast-ext-load.ll | 119 ++++++++++++++++++
.../test/CodeGen/X86/avx10_2_512bf16-arith.ll | 2 +-
llvm/test/CodeGen/X86/avx10_2bf16-arith.ll | 4 +-
llvm/test/CodeGen/X86/avx512-ext.ll | 32 ++---
...ad-of-small-alloca-with-zero-upper-half.ll | 12 +-
7 files changed, 171 insertions(+), 29 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6bf9008c3d677..06a8c832fe4a2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16944,6 +16944,23 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
return SDValue();
+ // fold: bitcast(freeze(load)) -> freeze(bitcast(load))
+ // fold: sext(freeze(load)) -> freeze(sext(load))
+ // fold: zext(freeze(load)) -> freeze(zext(load))
+ // This allows the conversion to potentially fold into the load.
+ if (N0.getOpcode() == ISD::LOAD && N->hasOneUse()) {
+ SDNode *User = *N->user_begin();
+ unsigned UserOpcode = User->getOpcode();
+ if (UserOpcode == ISD::BITCAST || UserOpcode == ISD::SIGN_EXTEND ||
+ UserOpcode == ISD::ZERO_EXTEND) {
+ SDValue NewConv =
+ DAG.getNode(UserOpcode, SDLoc(User), User->getValueType(0), N0);
+ SDValue FrozenConv = DAG.getFreeze(NewConv);
+ DAG.ReplaceAllUsesWith(User, FrozenConv.getNode());
+ return SDValue(N, 0);
+ }
+ }
+
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
// poison as far as possible. If an operand of freeze follows three
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b5f8ee50cba3d..5b677f6692ea6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3448,6 +3448,20 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
return false;
+ // With low alignment, don't convert integer vectors to large scalar loads,
+ // because otherwise they get broken into many small scalar loads.
+ if (LoadVT.isVector() && LoadVT.isInteger() && !BitcastVT.isVector() &&
+ BitcastVT.isInteger()) {
+ const DataLayout &DL = DAG.getDataLayout();
+ unsigned MinAlign = DL.getPointerSize();
+ // Aligned well, will legalize into a clean sequence of loads.
+ if (MMO.getAlign() >= MinAlign)
+ return true;
+ // Aligned poorly for a large enough scalar.
+ if (BitcastVT.getSizeInBits() > 2 * DL.getPointerSizeInBits())
+ return false;
+ }
+
// If both types are legal vectors, it's always ok to convert them.
if (LoadVT.isVector() && BitcastVT.isVector() &&
isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
diff --git a/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll b/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
new file mode 100644
index 0000000000000..8124d35b063a7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+define double @test_bitcast_freeze_load(ptr %p) {
+; CHECK-LABEL: test_bitcast_freeze_load:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
+ %v = load <2 x float>, ptr %p
+ %f = freeze <2 x float> %v
+ %b = bitcast <2 x float> %f to double
+ ret double %b
+}
+
+define i32 @test_sext_freeze_load_i8(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrsb w0, [x0]
+; CHECK-NEXT: ret
+ %v = load i8, ptr %p
+ %f = freeze i8 %v
+ %e = sext i8 %f to i32
+ ret i32 %e
+}
+
+define i64 @test_sext_freeze_load_i32(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: sxtw x0, w8
+; CHECK-NEXT: ret
+ %v = load i32, ptr %p
+ %f = freeze i32 %v
+ %e = sext i32 %f to i64
+ ret i64 %e
+}
+
+define i64 @test_sext_freeze_load_i16(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrsh x0, [x0]
+; CHECK-NEXT: ret
+ %v = load i16, ptr %p
+ %f = freeze i16 %v
+ %e = sext i16 %f to i64
+ ret i64 %e
+}
+
+define i32 @test_zext_freeze_load_i8(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w0, [x0]
+; CHECK-NEXT: ret
+ %v = load i8, ptr %p
+ %f = freeze i8 %v
+ %e = zext i8 %f to i32
+ ret i32 %e
+}
+
+define i64 @test_zext_freeze_load_i32(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w0, [x0]
+; CHECK-NEXT: ret
+ %v = load i32, ptr %p
+ %f = freeze i32 %v
+ %e = zext i32 %f to i64
+ ret i64 %e
+}
+
+define i64 @test_zext_freeze_load_i16(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w0, [x0]
+; CHECK-NEXT: ret
+ %v = load i16, ptr %p
+ %f = freeze i16 %v
+ %e = zext i16 %f to i64
+ ret i64 %e
+}
+
+define i32 @test_sext_freeze_load_multiuse(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_multiuse:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0]
+; CHECK-NEXT: sxtb w9, w8
+; CHECK-NEXT: add w0, w9, w8, uxtb
+; CHECK-NEXT: ret
+ %v = load i8, ptr %p
+ %f = freeze i8 %v
+ %e = sext i8 %f to i32
+ %z = zext i8 %f to i32
+ %r = add i32 %e, %z
+ ret i32 %r
+}
+
+define <4 x i32> @test_sext_freeze_load_v4i16(ptr %p) {
+; CHECK-LABEL: test_sext_freeze_load_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: ret
+ %v = load <4 x i16>, ptr %p
+ %f = freeze <4 x i16> %v
+ %e = sext <4 x i16> %f to <4 x i32>
+ ret <4 x i32> %e
+}
+
+define <4 x i32> @test_zext_freeze_load_v4i16(ptr %p) {
+; CHECK-LABEL: test_zext_freeze_load_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ret
+ %v = load <4 x i16>, ptr %p
+ %f = freeze <4 x i16> %v
+ %e = zext <4 x i16> %f to <4 x i32>
+ ret <4 x i32> %e
+}
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
index 79849a7153c91..d9b4635042256 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
; X86: # %bb.0:
-; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
; X86-NEXT: vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
index 0f2c75b15d5b4..01b7618753a23 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
; X86: # %bb.0:
-; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 1a712ffac5b7e..03f283a57a217 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -212,11 +212,9 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vmovdqu (%rdi), %ymm2
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -237,11 +235,9 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vmovdqu (%rdi), %ymm2
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -261,11 +257,9 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vmovdqu (%rdi), %ymm2
-; KNL-NEXT: vpmovsxbw %xmm2, %ymm3
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
-; KNL-NEXT: vpmovsxbw %xmm2, %ymm2
-; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; KNL-NEXT: vpmovsxbw (%rdi), %ymm2
+; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm3
+; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -286,11 +280,9 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vmovdqu (%rdi), %ymm2
-; AVX512DQNOBW-NEXT: vpmovsxbw %xmm2, %ymm3
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQNOBW-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm2
+; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm3
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 81c4d5d71084c..fce622a99bb6a 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -171,8 +171,8 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movb %al, (%rdx)
@@ -180,8 +180,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: movl (%rdi), %eax
+; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movb %al, (%rdx)
; X64-BMI2-NEXT: retq
@@ -248,8 +248,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
@@ -257,8 +257,8 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: movl (%rdi), %eax
+; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movw %ax, (%rdx)
; X64-BMI2-NEXT: retq
@@ -324,8 +324,8 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
@@ -333,8 +333,8 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: movl (%rdi), %eax
+; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movl %eax, (%rdx)
; X64-BMI2-NEXT: retq
From c396c3dcf7ce3ffff9279cac668f0d7233102ead Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Sun, 26 Oct 2025 17:57:21 +0200
Subject: [PATCH 2/2] Address review comment
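
The freeze-of-load special case in visitFREEZE is replaced by handling a
one-use freeze operand directly in visitBITCAST / visitSIGN_EXTEND /
visitZERO_EXTEND / visitANY_EXTEND, with the scalar extends wrapped in
AssertSext / AssertZext so the narrow-type range information survives the
freeze. A rough IR sketch of the scalar sext case under those assumptions
(function name invented for illustration):

  define i32 @sketch_sext_freeze(ptr %p) {
    %v = load i8, ptr %p
    %f = freeze i8 %v
    ; sext(freeze(x)) now becomes AssertSext(freeze(sext(x)), i8) in the DAG,
    ; so later combines still know the upper bits; AArch64 still selects a
    ; single ldrsb for this pattern.
    %e = sext i8 %f to i32
    ret i32 %e
  }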
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 18 +-
.../AArch64/freeze-bitcast-ext-load.ll | 3 +-
llvm/test/CodeGen/AArch64/freeze.ll | 12 +-
llvm/test/CodeGen/AArch64/pr66603.ll | 3 +-
llvm/test/CodeGen/AArch64/vector-compress.ll | 22 +-
llvm/test/CodeGen/AArch64/vselect-ext.ll | 24 +-
.../AMDGPU/amdgpu-codegenprepare-idiv.ll | 8 +-
.../atomic_optimizations_local_pointer.ll | 240 +--
llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 592 +++----
llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 59 +-
llvm/test/CodeGen/AMDGPU/frem.ll | 72 +-
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 160 +-
llvm/test/CodeGen/AMDGPU/mul_int24.ll | 74 +-
llvm/test/CodeGen/AMDGPU/select-undef.ll | 3 +-
.../test/CodeGen/AMDGPU/vector-reduce-smax.ll | 12 +-
.../test/CodeGen/AMDGPU/vector-reduce-smin.ll | 12 +-
.../test/CodeGen/AMDGPU/vector-reduce-umax.ll | 12 +-
.../test/CodeGen/AMDGPU/vector-reduce-umin.ll | 12 +-
llvm/test/CodeGen/RISCV/pr66603.ll | 4 -
llvm/test/CodeGen/SystemZ/pr60413.ll | 34 +-
.../test/CodeGen/Thumb2/mve-pred-selectop3.ll | 4 +-
llvm/test/CodeGen/X86/avx512-ext.ll | 28 +-
llvm/test/CodeGen/X86/freeze-binary.ll | 22 +-
llvm/test/CodeGen/X86/freeze.ll | 2 -
llvm/test/CodeGen/X86/midpoint-int-vec-512.ll | 84 +-
llvm/test/CodeGen/X86/movmsk-cmp.ll | 4 +-
llvm/test/CodeGen/X86/pr162812.ll | 50 +-
llvm/test/CodeGen/X86/ushl_sat_vec.ll | 2 +-
llvm/test/CodeGen/X86/var-permute-128.ll | 66 +-
llvm/test/CodeGen/X86/vector-compress.ll | 1165 +++++++-------
...ad-of-small-alloca-with-zero-upper-half.ll | 1408 ++++++++++-------
.../CodeGen/X86/widen-load-of-small-alloca.ll | 265 ++--
33 files changed, 2284 insertions(+), 2237 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 06a8c832fe4a2..ccb4a70e4dc23 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14815,6 +14815,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level))
return Res;
+ if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse() && !VT.isVector()) {
+ SDValue Res =
+ DAG.getFreeze(DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0)));
+ return DAG.getNode(ISD::AssertSext, DL, VT, Res,
+ DAG.getValueType(N0.getOperand(0).getValueType()));
+ }
+
return SDValue();
}
@@ -15194,6 +15201,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
return SDValue(CSENode, 0);
}
+ if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse() && !VT.isVector()) {
+ SDValue Res =
+ DAG.getFreeze(DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)));
+ return DAG.getNode(ISD::AssertZext, DL, VT, Res,
+ DAG.getValueType(N0.getOperand(0).getValueType()));
+ }
+
return SDValue();
}
@@ -15362,6 +15376,10 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level))
return Res;
+ if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse())
+ return DAG.getFreeze(
+ DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(0)));
+
return SDValue();
}
@@ -16911,6 +16929,11 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
return LegalShuffle;
}
+ if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse()) {
+ SDLoc DL(N);
+ return DAG.getFreeze(DAG.getNode(ISD::BITCAST, DL, VT, N0.getOperand(0)));
+ }
+
return SDValue();
}
@@ -16943,23 +16966,11 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
// example https://reviews.llvm.org/D136529#4120959.
if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
return SDValue();
-
- // fold: bitcast(freeze(load)) -> freeze(bitcast(load))
- // fold: sext(freeze(load)) -> freeze(sext(load))
- // fold: zext(freeze(load)) -> freeze(zext(load))
- // This allows the conversion to potentially fold into the load.
- if (N0.getOpcode() == ISD::LOAD && N->hasOneUse()) {
- SDNode *User = *N->user_begin();
- unsigned UserOpcode = User->getOpcode();
- if (UserOpcode == ISD::BITCAST || UserOpcode == ISD::SIGN_EXTEND ||
- UserOpcode == ISD::ZERO_EXTEND) {
- SDValue NewConv =
- DAG.getNode(UserOpcode, SDLoc(User), User->getValueType(0), N0);
- SDValue FrozenConv = DAG.getFreeze(NewConv);
- DAG.ReplaceAllUsesWith(User, FrozenConv.getNode());
- return SDValue(N, 0);
- }
- }
+ // Avoid folding extensions and bitcasts. Each of these operations handles
+ // FREEZE in their own respective visitors.
+ if (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND ||
+ N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::BITCAST)
+ return SDValue();
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5b677f6692ea6..d5c4235d2c5a0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3448,19 +3448,11 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
return false;
- // With low alignment, don't convert integer vectors to large scalar loads,
- // because otherwise they get broken into many small scalar loads.
- if (LoadVT.isVector() && LoadVT.isInteger() && !BitcastVT.isVector() &&
- BitcastVT.isInteger()) {
- const DataLayout &DL = DAG.getDataLayout();
- unsigned MinAlign = DL.getPointerSize();
- // Aligned well, will legalize into a clean sequence of loads.
- if (MMO.getAlign() >= MinAlign)
- return true;
- // Aligned poorly for a large enough scalar.
- if (BitcastVT.getSizeInBits() > 2 * DL.getPointerSizeInBits())
- return false;
- }
+ // If we have a large vector type (even if illegal), don't bitcast to large
+ // (illegal) scalar types. Better to load fewer vectors and extract.
+ if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
+ BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
+ return false;
// If both types are legal vectors, it's always ok to convert them.
if (LoadVT.isVector() && BitcastVT.isVector() &&
diff --git a/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll b/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
index 8124d35b063a7..361005dfb8664 100644
--- a/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
+++ b/llvm/test/CodeGen/AArch64/freeze-bitcast-ext-load.ll
@@ -26,8 +26,7 @@ define i32 @test_sext_freeze_load_i8(ptr %p) {
define i64 @test_sext_freeze_load_i32(ptr %p) {
; CHECK-LABEL: test_sext_freeze_load_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: sxtw x0, w8
+; CHECK-NEXT: ldrsw x0, [x0]
; CHECK-NEXT: ret
%v = load i32, ptr %p
%f = freeze i32 %v
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
index fb909fec90434..5920de998977a 100644
--- a/llvm/test/CodeGen/AArch64/freeze.ll
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -376,10 +376,14 @@ define i32 @freeze_anonstruct() {
}
define i32 @freeze_anonstruct2() {
-; CHECK-LABEL: freeze_anonstruct2:
-; CHECK: // %bb.0:
-; CHECK-NEXT: add w0, w8, w8, uxth
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: freeze_anonstruct2:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: freeze_anonstruct2:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: add w0, w8, w8, uxth
+; CHECK-GI-NEXT: ret
%y1 = freeze {i32, i16} undef
%v1 = extractvalue {i32, i16} %y1, 0
%v2 = extractvalue {i32, i16} %y1, 1
diff --git a/llvm/test/CodeGen/AArch64/pr66603.ll b/llvm/test/CodeGen/AArch64/pr66603.ll
index 2373b722fa04b..c265a9d5606f3 100644
--- a/llvm/test/CodeGen/AArch64/pr66603.ll
+++ b/llvm/test/CodeGen/AArch64/pr66603.ll
@@ -5,8 +5,7 @@
define i32 @PR66603(double %x) nounwind {
; CHECK-LABEL: PR66603:
; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs w8, d0
-; CHECK-NEXT: sxtb w0, w8
+; CHECK-NEXT: fcvtzs w0, d0
; CHECK-NEXT: ret
%as_i8 = fptosi double %x to i8
%frozen_i8 = freeze i8 %as_i8
diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll
index 55c343164a1b8..78f5843442422 100644
--- a/llvm/test/CodeGen/AArch64/vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -12,15 +12,16 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) {
; CHECK-NEXT: shl.4s v1, v1, #31
; CHECK-NEXT: cmlt.4s v1, v1, #0
; CHECK-NEXT: mov.s w9, v1[1]
-; CHECK-NEXT: fmov w11, s1
; CHECK-NEXT: mov.s w10, v1[2]
-; CHECK-NEXT: and x12, x11, #0x1
+; CHECK-NEXT: fmov w11, s1
; CHECK-NEXT: bfi x8, x11, #2, #1
-; CHECK-NEXT: mov x11, sp
+; CHECK-NEXT: and x11, x11, #0x1
; CHECK-NEXT: and x9, x9, #0x1
-; CHECK-NEXT: add x9, x12, x9
+; CHECK-NEXT: and w10, w10, #0x1
+; CHECK-NEXT: add x9, x11, x9
+; CHECK-NEXT: mov x11, sp
; CHECK-NEXT: st1.s { v0 }[1], [x8]
-; CHECK-NEXT: sub w10, w9, w10
+; CHECK-NEXT: add w10, w9, w10
; CHECK-NEXT: orr x9, x11, x9, lsl #2
; CHECK-NEXT: bfi x11, x10, #2, #2
; CHECK-NEXT: st1.s { v0 }[2], [x9]
@@ -420,15 +421,16 @@ define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) {
; CHECK-NEXT: shl.4s v1, v1, #31
; CHECK-NEXT: cmlt.4s v1, v1, #0
; CHECK-NEXT: mov.s w8, v1[1]
-; CHECK-NEXT: fmov w10, s1
; CHECK-NEXT: mov.s w9, v1[2]
-; CHECK-NEXT: and x12, x10, #0x1
+; CHECK-NEXT: fmov w10, s1
; CHECK-NEXT: bfi x11, x10, #2, #1
-; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: and x10, x10, #0x1
; CHECK-NEXT: and x8, x8, #0x1
-; CHECK-NEXT: add x8, x12, x8
+; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: add x8, x10, x8
+; CHECK-NEXT: mov x10, sp
; CHECK-NEXT: st1.s { v0 }[1], [x11]
-; CHECK-NEXT: sub w9, w8, w9
+; CHECK-NEXT: add w9, w8, w9
; CHECK-NEXT: orr x8, x10, x8, lsl #2
; CHECK-NEXT: bfi x10, x9, #2, #2
; CHECK-NEXT: st1.s { v0 }[2], [x8]
diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll
index 4f2b9c5a62669..c61c59068a319 100644
--- a/llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -594,10 +594,10 @@ define void @extension_in_loop_v16i8_to_v16i32(ptr %src, ptr %dst) {
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
; CHECK-NEXT: cmge.16b v5, v4, #0
-; CHECK-NEXT: tbl.16b v7, { v4 }, v0
-; CHECK-NEXT: tbl.16b v16, { v4 }, v1
-; CHECK-NEXT: tbl.16b v18, { v4 }, v2
-; CHECK-NEXT: tbl.16b v4, { v4 }, v3
+; CHECK-NEXT: tbl.16b v7, { v4 }, v3
+; CHECK-NEXT: tbl.16b v16, { v4 }, v2
+; CHECK-NEXT: tbl.16b v18, { v4 }, v1
+; CHECK-NEXT: tbl.16b v4, { v4 }, v0
; CHECK-NEXT: sshll2.8h v6, v5, #0
; CHECK-NEXT: sshll.8h v5, v5, #0
; CHECK-NEXT: sshll2.4s v17, v6, #0
@@ -664,10 +664,10 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(ptr %src, ptr %dst) {
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
; CHECK-NEXT: cmge.16b v5, v4, #0
-; CHECK-NEXT: tbl.16b v7, { v4 }, v0
-; CHECK-NEXT: tbl.16b v16, { v4 }, v1
-; CHECK-NEXT: tbl.16b v18, { v4 }, v2
-; CHECK-NEXT: tbl.16b v4, { v4 }, v3
+; CHECK-NEXT: tbl.16b v7, { v4 }, v3
+; CHECK-NEXT: tbl.16b v16, { v4 }, v2
+; CHECK-NEXT: tbl.16b v18, { v4 }, v1
+; CHECK-NEXT: tbl.16b v4, { v4 }, v0
; CHECK-NEXT: sshll2.8h v6, v5, #0
; CHECK-NEXT: sshll.8h v5, v5, #0
; CHECK-NEXT: sshll2.4s v17, v6, #0
@@ -735,10 +735,10 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(ptr %src, ptr %dst) {
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
; CHECK-NEXT: cmge.16b v5, v4, #0
-; CHECK-NEXT: tbl.16b v7, { v4 }, v0
-; CHECK-NEXT: tbl.16b v16, { v4 }, v1
-; CHECK-NEXT: tbl.16b v18, { v4 }, v2
-; CHECK-NEXT: tbl.16b v4, { v4 }, v3
+; CHECK-NEXT: tbl.16b v7, { v4 }, v3
+; CHECK-NEXT: tbl.16b v16, { v4 }, v2
+; CHECK-NEXT: tbl.16b v18, { v4 }, v1
+; CHECK-NEXT: tbl.16b v4, { v4 }, v0
; CHECK-NEXT: sshll2.8h v6, v5, #0
; CHECK-NEXT: sshll.8h v5, v5, #0
; CHECK-NEXT: sshll2.4s v17, v6, #0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 948811ea45f77..eacd960153c29 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7769,7 +7769,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
;
; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
@@ -7938,7 +7938,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
;
; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
; GFX9-NEXT: s_ashr_i32 s6, s1, 31
@@ -9037,7 +9037,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
;
; GFX6-LABEL: srem_i64_pow2_shl_denom:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
@@ -9208,7 +9208,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
;
; GFX9-LABEL: srem_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 23c5f4f5506f3..d4b3f5c303467 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -11184,19 +11184,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX7LESS_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop
; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
@@ -11241,19 +11241,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX8_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX8_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
@@ -11297,19 +11297,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX9_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX9_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
@@ -13010,19 +13010,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX7LESS_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop
; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
@@ -13067,19 +13067,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX8_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX8_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
@@ -13123,19 +13123,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX9_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX9_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
@@ -14831,19 +14831,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX7LESS_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop
; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
@@ -14887,19 +14887,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX8_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX8_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
@@ -14942,19 +14942,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX9_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX9_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
@@ -16645,19 +16645,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX7LESS_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop
; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
@@ -16701,19 +16701,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX8_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX8_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
@@ -16756,19 +16756,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX9_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10
-; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s6
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s7
; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
-; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
-; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s10
+; GFX9_ITERATIVE-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
-; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9
-; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 0fc54aeaef77b..c187aac4fc4a2 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -6,77 +6,77 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-LABEL: v_sdiv_v2i128_vv:
; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 0, v0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3
; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11
-; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v1, vcc
; SDAG-NEXT: v_mov_b32_e32 v26, v24
; SDAG-NEXT: v_mov_b32_e32 v27, v25
-; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v1, v20
-; SDAG-NEXT: v_ffbh_u32_e32 v2, v21
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v0, v20, v16
-; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8
-; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v16
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v2, vcc
+; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v3, vcc
+; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e32 v17, v3, v17, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v16, v2, v16, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v21, v1, v20, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v20, v0, v18, vcc
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, 0, v8
; SDAG-NEXT: v_or_b32_e32 v1, v21, v17
-; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT: v_min_u32_e32 v2, v19, v2
-; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22
+; SDAG-NEXT: v_or_b32_e32 v0, v20, v16
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v9, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v16
; SDAG-NEXT: v_ffbh_u32_e32 v22, v17
+; SDAG-NEXT: v_ffbh_u32_e32 v23, v20
+; SDAG-NEXT: v_ffbh_u32_e32 v28, v21
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7]
-; SDAG-NEXT: v_min_u32_e32 v1, v19, v22
-; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2
-; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc
-; SDAG-NEXT: v_ffbh_u32_e32 v3, v29
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7]
-; SDAG-NEXT: v_or_b32_e32 v2, v29, v0
-; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3
-; SDAG-NEXT: v_ffbh_u32_e32 v11, v0
+; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 32, v18
+; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v23
+; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v11, vcc
+; SDAG-NEXT: v_min_u32_e32 v22, v1, v22
+; SDAG-NEXT: v_min_u32_e32 v18, v18, v28
+; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v11, v23, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v28, v9, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v29, v8, v2, vcc
+; SDAG-NEXT: v_add_i32_e32 v8, vcc, 64, v18
+; SDAG-NEXT: v_addc_u32_e64 v9, s[6:7], 0, 0, vcc
; SDAG-NEXT: v_or_b32_e32 v3, v28, v1
-; SDAG-NEXT: v_min_u32_e32 v8, v8, v19
-; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v1
+; SDAG-NEXT: v_or_b32_e32 v2, v29, v0
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v22, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v10, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v11, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v29
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v28
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_min_u32_e32 v2, v11, v19
-; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8
-; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v8, 0, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[6:7]
+; SDAG-NEXT: v_add_i32_e64 v2, s[6:7], 32, v10
+; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 32, v18
+; SDAG-NEXT: v_min_u32_e32 v2, v2, v11
+; SDAG-NEXT: v_min_u32_e32 v3, v3, v22
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v2, v10
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v9, vcc
+; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v3
+; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v2, v8
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v10, v9, vcc
; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v8
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v19, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[8:9]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v19, vcc
; SDAG-NEXT: v_or_b32_e32 v2, v2, v10
; SDAG-NEXT: v_or_b32_e32 v3, v9, v11
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v19, v18, s[4:5]
; SDAG-NEXT: v_and_b32_e32 v2, 1, v2
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
@@ -1564,67 +1564,67 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3
-; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; SDAG-NEXT: v_mov_b32_e32 v29, v28
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v18, v16
-; SDAG-NEXT: v_ffbh_u32_e32 v20, v17
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v3, vcc
; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v2, v16, v0
-; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v0
-; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT: v_or_b32_e32 v3, v17, v1
-; SDAG-NEXT: v_min_u32_e32 v18, v18, v20
-; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v1
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
-; SDAG-NEXT: v_min_u32_e32 v3, v20, v22
-; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18
-; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v20, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v9, vcc
+; SDAG-NEXT: v_or_b32_e32 v17, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v16, v0, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v3
+; SDAG-NEXT: v_ffbh_u32_e32 v23, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v24, v1
+; SDAG-NEXT: v_subb_u32_e32 v25, vcc, 0, v10, vcc
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[16:17]
+; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20
+; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v23
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5]
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_min_u32_e32 v16, v16, v22
+; SDAG-NEXT: v_min_u32_e32 v17, v17, v24
+; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v10, v10, v25, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v30, v9, v18, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v31, v8, v21, vcc
+; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
+; SDAG-NEXT: v_addc_u32_e64 v18, s[6:7], 0, 0, vcc
+; SDAG-NEXT: v_or_b32_e32 v9, v30, v11
+; SDAG-NEXT: v_or_b32_e32 v8, v31, v10
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc
-; SDAG-NEXT: v_ffbh_u32_e32 v9, v31
-; SDAG-NEXT: v_ffbh_u32_e32 v21, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v8, v31, v2
-; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9
-; SDAG-NEXT: v_ffbh_u32_e32 v20, v2
-; SDAG-NEXT: v_or_b32_e32 v9, v30, v3
-; SDAG-NEXT: v_min_u32_e32 v11, v11, v21
-; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20
-; SDAG-NEXT: v_ffbh_u32_e32 v21, v3
+; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v17, v10
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v11
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v31
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v30
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_min_u32_e32 v8, v20, v21
-; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11
-; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5]
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc
-; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10
+; SDAG-NEXT: v_add_i32_e64 v8, s[6:7], 32, v17
+; SDAG-NEXT: v_add_i32_e64 v9, s[6:7], 32, v21
+; SDAG-NEXT: v_min_u32_e32 v8, v8, v20
+; SDAG-NEXT: v_min_u32_e32 v9, v9, v22
+; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT: v_add_i32_e32 v9, vcc, 64, v9
+; SDAG-NEXT: v_addc_u32_e64 v17, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v8, v16
+; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v17, v18, vcc
+; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v16
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v19, vcc
; SDAG-NEXT: v_or_b32_e32 v8, v8, v18
-; SDAG-NEXT: v_or_b32_e32 v9, v11, v19
+; SDAG-NEXT: v_or_b32_e32 v9, v17, v19
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
@@ -1633,71 +1633,71 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v34, v1, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v34, v3, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v32, v2, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v27, v1, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v33, v0, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB2_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
-; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10
-; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10
+; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v16
+; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v16
; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: v_mov_b32_e32 v9, 0
-; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v17, vcc
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[0:1], v20
; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc
-; SDAG-NEXT: v_or_b32_e32 v18, v32, v34
-; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10
-; SDAG-NEXT: v_or_b32_e32 v19, v33, v35
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24
-; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25
-; SDAG-NEXT: v_or_b32_e32 v11, v11, v19
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v18
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v17, v32, v34
+; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0x7f, v16
+; SDAG-NEXT: v_or_b32_e32 v18, v33, v35
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v19
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v19
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v19
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[17:18]
+; SDAG-NEXT: v_lshr_b64 v[16:17], v[0:1], v16
+; SDAG-NEXT: v_or_b32_e32 v17, v23, v17
+; SDAG-NEXT: v_or_b32_e32 v16, v22, v16
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v25, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v24, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB2_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT: v_lshr_b64 v[22:23], v[16:17], v32
+; SDAG-NEXT: v_lshr_b64 v[22:23], v[0:1], v32
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 64, v32
; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32
-; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32
+; SDAG-NEXT: v_lshr_b64 v[24:25], v[2:3], v32
; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0
; SDAG-NEXT: v_mov_b32_e32 v9, 0
-; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v8
-; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[2:3], v8
+; SDAG-NEXT: v_lshr_b64 v[48:49], v[2:3], v37
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc
; SDAG-NEXT: v_or_b32_e32 v8, v23, v27
; SDAG-NEXT: v_or_b32_e32 v22, v22, v26
-; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v2, vcc
+; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v10, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32
; SDAG-NEXT: v_cndmask_b32_e64 v8, v49, v8, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v22, v48, v22, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5]
-; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v3, vcc
+; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v11, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
-; SDAG-NEXT: v_cndmask_b32_e32 v25, v8, v17, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v24, v22, v16, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v24, v22, v0, vcc
; SDAG-NEXT: v_mov_b32_e32 v22, 0
; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3
@@ -1707,13 +1707,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v25
; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v11
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
; SDAG-NEXT: v_or_b32_e32 v21, v23, v21
; SDAG-NEXT: v_or_b32_e32 v20, v22, v20
; SDAG-NEXT: v_or_b32_e32 v22, v26, v48
; SDAG-NEXT: v_or_b32_e32 v23, v24, v49
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v8
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v8
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v36, v23
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v25, vcc
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v38, v22, vcc
@@ -1721,8 +1721,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; SDAG-NEXT: v_and_b32_e32 v24, v8, v31
; SDAG-NEXT: v_and_b32_e32 v26, v8, v30
-; SDAG-NEXT: v_and_b32_e32 v48, v8, v2
-; SDAG-NEXT: v_and_b32_e32 v49, v8, v3
+; SDAG-NEXT: v_and_b32_e32 v48, v8, v10
+; SDAG-NEXT: v_and_b32_e32 v49, v8, v11
; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v23, v24
; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v26, vcc
@@ -1735,9 +1735,9 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v22, v32, v34
; SDAG-NEXT: v_or_b32_e32 v23, v33, v35
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23]
-; SDAG-NEXT: v_or_b32_e32 v11, v19, v11
+; SDAG-NEXT: v_or_b32_e32 v17, v19, v17
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v10, v18, v10
+; SDAG-NEXT: v_or_b32_e32 v16, v18, v16
; SDAG-NEXT: v_mov_b32_e32 v23, v9
; SDAG-NEXT: v_mov_b32_e32 v22, v8
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
@@ -1746,123 +1746,123 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB2_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v22
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v22
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT: v_or_b32_e32 v34, v19, v11
-; SDAG-NEXT: v_or_b32_e32 v32, v18, v10
+; SDAG-NEXT: v_or_b32_e32 v34, v19, v17
+; SDAG-NEXT: v_or_b32_e32 v32, v18, v16
; SDAG-NEXT: v_or_b32_e32 v27, v9, v21
; SDAG-NEXT: v_or_b32_e32 v33, v8, v20
; SDAG-NEXT: .LBB2_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
; SDAG-NEXT: v_mov_b32_e32 v35, v26
; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v6, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v6, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v9, v5, v9, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v8, v4, v8, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v10, v8
-; SDAG-NEXT: v_ffbh_u32_e32 v11, v9
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v16, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v16, v8
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v9
; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0, v12
; SDAG-NEXT: v_or_b32_e32 v6, v8, v4
; SDAG-NEXT: v_ffbh_u32_e32 v20, v4
-; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v10
+; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], 32, v16
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v13, vcc
; SDAG-NEXT: v_or_b32_e32 v7, v9, v5
; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v20
; SDAG-NEXT: v_ffbh_u32_e32 v22, v5
-; SDAG-NEXT: v_min_u32_e32 v10, v10, v11
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v14, vcc
+; SDAG-NEXT: v_min_u32_e32 v16, v16, v18
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v14, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
; SDAG-NEXT: v_cndmask_b32_e64 v36, v13, v21, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v37, v12, v19, s[4:5]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7]
; SDAG-NEXT: v_min_u32_e32 v7, v20, v22
-; SDAG-NEXT: v_add_i32_e64 v10, s[8:9], 64, v10
-; SDAG-NEXT: v_addc_u32_e64 v12, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT: v_subb_u32_e32 v13, vcc, 0, v15, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v11, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v11, v37
-; SDAG-NEXT: v_ffbh_u32_e32 v14, v36
+; SDAG-NEXT: v_add_i32_e64 v12, s[8:9], 64, v16
+; SDAG-NEXT: v_addc_u32_e64 v13, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v15, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v18, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v14, v37
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v36
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v19, v10, v7, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v13, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v10, v37, v6
-; SDAG-NEXT: v_ffbh_u32_e32 v13, v6
-; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v11
-; SDAG-NEXT: v_or_b32_e32 v11, v36, v7
-; SDAG-NEXT: v_add_i32_e32 v13, vcc, 32, v13
-; SDAG-NEXT: v_ffbh_u32_e32 v20, v7
-; SDAG-NEXT: v_min_u32_e32 v14, v15, v14
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_min_u32_e32 v10, v13, v20
-; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 64, v14
-; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v13, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v20, v12, v7, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v16, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v12, v37, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v15, v6
+; SDAG-NEXT: v_add_i32_e32 v14, vcc, 32, v14
+; SDAG-NEXT: v_or_b32_e32 v13, v36, v7
+; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v15
+; SDAG-NEXT: v_ffbh_u32_e32 v16, v7
+; SDAG-NEXT: v_min_u32_e32 v14, v14, v18
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; SDAG-NEXT: v_min_u32_e32 v12, v15, v16
+; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], 64, v14
+; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
-; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v19
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v13, v12, vcc
-; SDAG-NEXT: v_xor_b32_e32 v12, 0x7f, v10
-; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 0, v18, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v15, vcc, 0, v18, vcc
-; SDAG-NEXT: v_or_b32_e32 v12, v12, v14
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
+; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v12, v20
+; SDAG-NEXT: v_subb_u32_e32 v13, vcc, v14, v19, vcc
+; SDAG-NEXT: v_xor_b32_e32 v14, 0x7f, v12
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v17, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[12:13]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v17, vcc
+; SDAG-NEXT: v_or_b32_e32 v14, v14, v16
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v15, v13, v17
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v13, v11, v15
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; SDAG-NEXT: v_and_b32_e32 v12, 1, v18
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12
+; SDAG-NEXT: v_and_b32_e32 v14, 1, v18
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, v9, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v8, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v15, v9, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v8, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB2_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v10
-; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v10
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
-; SDAG-NEXT: v_mov_b32_e32 v13, 0
-; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v11, vcc
+; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v12
+; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v12
+; SDAG-NEXT: v_mov_b32_e32 v14, 0
+; SDAG-NEXT: v_mov_b32_e32 v15, 0
+; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v13, vcc
; SDAG-NEXT: v_lshl_b64 v[18:19], v[8:9], v18
-; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc
-; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v15, vcc
-; SDAG-NEXT: v_or_b32_e32 v14, v38, v48
-; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 0x7f, v10
-; SDAG-NEXT: v_or_b32_e32 v15, v39, v49
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[4:5], v22
+; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v16, vcc
+; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v17, vcc
+; SDAG-NEXT: v_or_b32_e32 v16, v38, v48
+; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 0x7f, v12
+; SDAG-NEXT: v_or_b32_e32 v17, v39, v49
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[4:5], v22
; SDAG-NEXT: v_sub_i32_e32 v23, vcc, 64, v22
; SDAG-NEXT: v_lshl_b64 v[20:21], v[8:9], v22
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT: v_lshr_b64 v[14:15], v[8:9], v23
-; SDAG-NEXT: v_or_b32_e32 v11, v11, v15
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v14
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_lshr_b64 v[16:17], v[8:9], v23
+; SDAG-NEXT: v_or_b32_e32 v13, v13, v17
+; SDAG-NEXT: v_or_b32_e32 v12, v12, v16
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22
-; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v10, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v21, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v20, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, v13, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v12, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v21, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v20, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
-; SDAG-NEXT: v_cndmask_b32_e64 v15, v14, v5, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v14, v18, v4, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v16, v5, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v18, v4, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -1870,52 +1870,52 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_cbranch_execz .LBB2_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
; SDAG-NEXT: v_lshr_b64 v[20:21], v[8:9], v38
-; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v38
+; SDAG-NEXT: v_sub_i32_e32 v14, vcc, 64, v38
; SDAG-NEXT: v_subrev_i32_e32 v51, vcc, 64, v38
; SDAG-NEXT: v_lshr_b64 v[22:23], v[4:5], v38
; SDAG-NEXT: v_add_i32_e32 v50, vcc, -1, v37
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v13, 0
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v12
+; SDAG-NEXT: v_mov_b32_e32 v15, 0
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v14
; SDAG-NEXT: v_lshr_b64 v[53:54], v[4:5], v51
; SDAG-NEXT: v_addc_u32_e32 v51, vcc, -1, v36, vcc
-; SDAG-NEXT: v_or_b32_e32 v12, v21, v25
+; SDAG-NEXT: v_or_b32_e32 v14, v21, v25
; SDAG-NEXT: v_or_b32_e32 v20, v20, v24
; SDAG-NEXT: v_addc_u32_e32 v52, vcc, -1, v6, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v38
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v54, v12, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v54, v14, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v20, v53, v20, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v23, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v22, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v53, vcc, -1, v7, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38
-; SDAG-NEXT: v_cndmask_b32_e32 v23, v12, v9, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v23, v14, v9, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v22, v20, v8, vcc
; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: .LBB2_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v12, 31, v23
+; SDAG-NEXT: v_lshrrev_b32_e32 v14, 31, v23
; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v15
-; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v11
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT: v_or_b32_e32 v24, v24, v12
+; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v13
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT: v_or_b32_e32 v24, v24, v14
; SDAG-NEXT: v_or_b32_e32 v22, v22, v54
-; SDAG-NEXT: v_or_b32_e32 v12, v14, v55
-; SDAG-NEXT: v_or_b32_e32 v15, v19, v15
-; SDAG-NEXT: v_or_b32_e32 v11, v21, v11
-; SDAG-NEXT: v_or_b32_e32 v14, v18, v12
-; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v50, v22
-; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v51, v23, vcc
-; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v52, v24, vcc
-; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v53, v25, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v12
-; SDAG-NEXT: v_and_b32_e32 v12, 1, v21
+; SDAG-NEXT: v_or_b32_e32 v14, v16, v55
+; SDAG-NEXT: v_or_b32_e32 v17, v19, v17
+; SDAG-NEXT: v_or_b32_e32 v13, v21, v13
+; SDAG-NEXT: v_or_b32_e32 v16, v18, v14
+; SDAG-NEXT: v_sub_i32_e32 v14, vcc, v50, v22
+; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v51, v23, vcc
+; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v52, v24, vcc
+; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v53, v25, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v14
+; SDAG-NEXT: v_and_b32_e32 v14, 1, v21
; SDAG-NEXT: v_and_b32_e32 v54, v21, v7
; SDAG-NEXT: v_and_b32_e32 v55, v21, v6
; SDAG-NEXT: v_and_b32_e32 v40, v21, v36
@@ -1932,80 +1932,80 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v54, v38, v48
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[54:55]
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v10, v20, v10
-; SDAG-NEXT: v_mov_b32_e32 v21, v13
-; SDAG-NEXT: v_mov_b32_e32 v20, v12
+; SDAG-NEXT: v_or_b32_e32 v12, v20, v12
+; SDAG-NEXT: v_mov_b32_e32 v21, v15
+; SDAG-NEXT: v_mov_b32_e32 v20, v14
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB2_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB2_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v11
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT: v_or_b32_e32 v14, v14, v20
-; SDAG-NEXT: v_or_b32_e32 v19, v19, v15
-; SDAG-NEXT: v_or_b32_e32 v13, v13, v11
-; SDAG-NEXT: v_or_b32_e32 v18, v18, v14
-; SDAG-NEXT: v_or_b32_e32 v12, v12, v10
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v13
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
+; SDAG-NEXT: v_or_b32_e32 v19, v19, v17
+; SDAG-NEXT: v_or_b32_e32 v15, v15, v13
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v16
+; SDAG-NEXT: v_or_b32_e32 v14, v14, v12
; SDAG-NEXT: .LBB2_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT: v_mul_lo_u32 v14, v33, v3
-; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0
-; SDAG-NEXT: v_mul_lo_u32 v24, v27, v2
+; SDAG-NEXT: v_mul_lo_u32 v13, v33, v11
+; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v33, v10, 0
+; SDAG-NEXT: v_mul_lo_u32 v10, v27, v10
; SDAG-NEXT: v_mul_lo_u32 v25, v34, v31
; SDAG-NEXT: v_mul_lo_u32 v34, v32, v30
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0
-; SDAG-NEXT: v_mov_b32_e32 v15, 0
-; SDAG-NEXT: v_mul_lo_u32 v38, v12, v7
-; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v6, 0
-; SDAG-NEXT: v_mul_lo_u32 v39, v13, v6
-; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37
-; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v12, 0
-; SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v14
-; SDAG-NEXT: v_mov_b32_e32 v14, v3
-; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15]
-; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v2
-; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v38
-; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v24
-; SDAG-NEXT: v_mov_b32_e32 v14, v22
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[14:15]
-; SDAG-NEXT: v_xor_b32_e32 v24, v16, v28
-; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v21, v39
-; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11]
-; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v23, v3
-; SDAG-NEXT: v_addc_u32_e64 v23, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v31, vcc, v17, v2, vcc
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21]
-; SDAG-NEXT: v_mov_b32_e32 v14, v7
-; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v36, v12, v[14:15]
+; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v31, v33, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mul_lo_u32 v38, v14, v7
+; SDAG-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v14, v6, 0
+; SDAG-NEXT: v_mul_lo_u32 v39, v15, v6
+; SDAG-NEXT: v_mul_lo_u32 v48, v19, v37
+; SDAG-NEXT: v_mul_lo_u32 v49, v18, v36
+; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v37, v14, 0
+; SDAG-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; SDAG-NEXT: v_mov_b32_e32 v19, v17
+; SDAG-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v30, v33, v[19:20]
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
+; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v22, v38
+; SDAG-NEXT: v_add_i32_e64 v12, s[4:5], v12, v10
+; SDAG-NEXT: v_mov_b32_e32 v19, v23
+; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v31, v27, v[19:20]
+; SDAG-NEXT: v_xor_b32_e32 v23, v0, v28
+; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], v13, v39
+; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[11:12]
+; SDAG-NEXT: v_add_i32_e64 v12, s[4:5], v24, v17
+; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v1, v16, vcc
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v37, v[21:22]
+; SDAG-NEXT: v_mov_b32_e32 v19, v7
+; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v36, v14, v[19:20]
; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v25, v11
-; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[22:23]
-; SDAG-NEXT: v_xor_b32_e32 v18, v31, v29
-; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3
-; SDAG-NEXT: v_mov_b32_e32 v14, v16
-; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[14:15]
+; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[12:13]
+; SDAG-NEXT: v_xor_b32_e32 v18, v24, v29
+; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], v48, v1
+; SDAG-NEXT: v_mov_b32_e32 v19, v16
+; SDAG-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v37, v15, v[19:20]
; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], v34, v7
-; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3
-; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v17, v15
-; SDAG-NEXT: v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10
+; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v49, v1
+; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v17, v14
+; SDAG-NEXT: v_addc_u32_e64 v17, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], v11, v10
; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v10, vcc
-; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v36, v13, v[15:16]
-; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc
-; SDAG-NEXT: v_xor_b32_e32 v7, v0, v28
-; SDAG-NEXT: v_add_i32_e32 v10, vcc, v10, v2
-; SDAG-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc
-; SDAG-NEXT: v_xor_b32_e32 v3, v1, v29
-; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v24, v28
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v2, v1, vcc
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v36, v15, v[16:17]
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; SDAG-NEXT: v_xor_b32_e32 v7, v10, v28
+; SDAG-NEXT: v_add_i32_e32 v10, vcc, v1, v0
+; SDAG-NEXT: v_addc_u32_e32 v11, vcc, v2, v19, vcc
+; SDAG-NEXT: v_xor_b32_e32 v3, v3, v29
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v23, v28
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v18, v29, vcc
; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v7, v28, vcc
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc
; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v8, v6
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v14, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v13, vcc
; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26
; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc
; SDAG-NEXT: v_xor_b32_e32 v7, v7, v35
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index a025c36f620c7..7e233e648cdbc 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -211,22 +211,23 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, -1
+; SI-NEXT: v_mov_b32_e32 v1, 0x432fffff
; SI-NEXT: s_brev_b32 s8, -2
-; SI-NEXT: v_mov_b32_e32 v1, 0x43300000
-; SI-NEXT: v_mov_b32_e32 v0, 0
-; SI-NEXT: v_mov_b32_e32 v2, -1
-; SI-NEXT: v_mov_b32_e32 v3, 0x432fffff
+; SI-NEXT: v_mov_b32_e32 v3, 0x43300000
+; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_bfi_b32 v3, s8, v3, v4
; SI-NEXT: v_mov_b32_e32 v6, s3
-; SI-NEXT: v_bfi_b32 v1, s8, v1, v6
; SI-NEXT: v_mov_b32_e32 v7, s2
-; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[0:1]
-; SI-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1]
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[2:3]
+; SI-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[0:1]
+; SI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -270,19 +271,21 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s9, 0x432fffff
; SI-NEXT: s_brev_b32 s10, -2
; SI-NEXT: v_mov_b32_e32 v6, 0x43300000
-; SI-NEXT: s_mov_b32 s9, 0x432fffff
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_bfi_b32 v1, s10, v6, v1
; SI-NEXT: v_mov_b32_e32 v7, s7
-; SI-NEXT: v_bfi_b32 v1, s10, v6, v7
; SI-NEXT: v_mov_b32_e32 v8, s6
; SI-NEXT: v_mov_b32_e32 v9, s5
-; SI-NEXT: v_mov_b32_e32 v10, s4
+; SI-NEXT: v_mov_b32_e32 v10, s5
+; SI-NEXT: v_mov_b32_e32 v11, s4
; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1]
; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1]
; SI-NEXT: v_bfi_b32 v1, s10, v6, v9
@@ -292,8 +295,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1]
; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -347,26 +350,30 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x11
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s13, 0x432fffff
; SI-NEXT: s_brev_b32 s14, -2
; SI-NEXT: v_mov_b32_e32 v10, 0x43300000
-; SI-NEXT: s_mov_b32 s13, 0x432fffff
; SI-NEXT: v_mov_b32_e32 v4, 0
; SI-NEXT: s_mov_b32 s12, s10
; SI-NEXT: v_mov_b32_e32 v8, s12
; SI-NEXT: v_mov_b32_e32 v9, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_bfi_b32 v5, s14, v10, v0
; SI-NEXT: v_mov_b32_e32 v2, s3
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v2
; SI-NEXT: v_mov_b32_e32 v6, s2
+; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v7, s1
; SI-NEXT: v_mov_b32_e32 v11, s0
; SI-NEXT: v_mov_b32_e32 v12, s7
-; SI-NEXT: v_mov_b32_e32 v13, s6
-; SI-NEXT: v_mov_b32_e32 v14, s5
-; SI-NEXT: v_mov_b32_e32 v15, s4
+; SI-NEXT: v_mov_b32_e32 v13, s7
+; SI-NEXT: v_mov_b32_e32 v14, s6
+; SI-NEXT: v_mov_b32_e32 v15, s5
+; SI-NEXT: v_mov_b32_e32 v16, s5
+; SI-NEXT: v_mov_b32_e32 v17, s4
; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5]
; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v7
+; SI-NEXT: v_bfi_b32 v5, s14, v10, v3
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9]
; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc
@@ -378,15 +385,15 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5]
; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v14
+; SI-NEXT: v_bfi_b32 v5, s14, v10, v15
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
-; SI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[4:5]
; SI-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v4, v17, vcc
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 6f91222b2f396..a37e7dc5e31a9 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -4307,22 +4307,30 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
-; SI-NEXT: v_readfirstlane_b32 s0, v4
-; SI-NEXT: v_readfirstlane_b32 s1, v5
-; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
-; SI-NEXT: s_add_i32 s8, s2, 0xfffffc01
-; SI-NEXT: s_mov_b32 s3, 0xfffff
-; SI-NEXT: s_mov_b32 s2, s6
-; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
-; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
-; SI-NEXT: s_and_b32 s9, s1, 0x80000000
+; SI-NEXT: v_readfirstlane_b32 s2, v4
+; SI-NEXT: v_readfirstlane_b32 s3, v5
+; SI-NEXT: s_bfe_u32 s0, s3, 0xb0014
+; SI-NEXT: s_add_i32 s8, s0, 0xfffffc01
+; SI-NEXT: s_mov_b32 s1, 0xfffff
+; SI-NEXT: s_mov_b32 s0, s6
+; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s8
+; SI-NEXT: v_not_b32_e32 v6, s0
+; SI-NEXT: v_and_b32_e32 v4, v4, v6
+; SI-NEXT: v_not_b32_e32 v6, s1
+; SI-NEXT: v_and_b32_e32 v5, v5, v6
+; SI-NEXT: s_and_b32 s0, s3, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s8, 0
-; SI-NEXT: s_cselect_b32 s2, 0, s2
-; SI-NEXT: s_cselect_b32 s3, s9, s3
+; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s0
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
; SI-NEXT: s_cmp_gt_i32 s8, 51
-; SI-NEXT: s_cselect_b32 s1, s1, s3
-; SI-NEXT: s_cselect_b32 s0, s0, s2
-; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1]
+; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: v_mov_b32_e32 v6, s3
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s2
+; SI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -4585,22 +4593,30 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
-; SI-NEXT: v_readfirstlane_b32 s0, v4
-; SI-NEXT: v_readfirstlane_b32 s1, v5
-; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
-; SI-NEXT: s_add_i32 s8, s2, 0xfffffc01
-; SI-NEXT: s_mov_b32 s3, 0xfffff
-; SI-NEXT: s_mov_b32 s2, s6
-; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
-; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
-; SI-NEXT: s_and_b32 s9, s1, 0x80000000
+; SI-NEXT: v_readfirstlane_b32 s2, v4
+; SI-NEXT: v_readfirstlane_b32 s3, v5
+; SI-NEXT: s_bfe_u32 s0, s3, 0xb0014
+; SI-NEXT: s_add_i32 s8, s0, 0xfffffc01
+; SI-NEXT: s_mov_b32 s1, 0xfffff
+; SI-NEXT: s_mov_b32 s0, s6
+; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s8
+; SI-NEXT: v_not_b32_e32 v6, s0
+; SI-NEXT: v_and_b32_e32 v4, v4, v6
+; SI-NEXT: v_not_b32_e32 v6, s1
+; SI-NEXT: v_and_b32_e32 v5, v5, v6
+; SI-NEXT: s_and_b32 s0, s3, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s8, 0
-; SI-NEXT: s_cselect_b32 s2, 0, s2
-; SI-NEXT: s_cselect_b32 s3, s9, s3
+; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s0
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
; SI-NEXT: s_cmp_gt_i32 s8, 51
-; SI-NEXT: s_cselect_b32 s1, s1, s3
-; SI-NEXT: s_cselect_b32 s0, s0, s2
-; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1]
+; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: v_mov_b32_e32 v6, s3
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s2
+; SI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 6ae058b38e74f..c43a9ffa3d57d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -492,21 +492,21 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mul_hi_u32 v1, s1, v0
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_mul_hi_i32 v1, s1, v0
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mul_hi_u32 v3, s1, v2
; SI-NEXT: s_mul_i32 s4, s1, s2
-; SI-NEXT: v_mov_b32_e32 v2, s3
-; SI-NEXT: v_mul_hi_u32 v3, s0, v2
-; SI-NEXT: s_mul_i32 s5, s0, s3
; SI-NEXT: v_mul_hi_u32 v0, s0, v0
-; SI-NEXT: v_mul_hi_i32 v2, s1, v2
+; SI-NEXT: s_mul_i32 s5, s0, s3
+; SI-NEXT: v_mul_hi_u32 v2, s0, v2
; SI-NEXT: s_mul_i32 s6, s1, s3
; SI-NEXT: s_mul_i32 s8, s0, s2
-; SI-NEXT: v_readfirstlane_b32 s9, v1
-; SI-NEXT: v_readfirstlane_b32 s10, v3
-; SI-NEXT: v_readfirstlane_b32 s11, v0
-; SI-NEXT: v_readfirstlane_b32 s12, v2
-; SI-NEXT: v_add_i32_e32 v0, vcc, s5, v0
+; SI-NEXT: v_readfirstlane_b32 s9, v3
+; SI-NEXT: v_readfirstlane_b32 s10, v0
+; SI-NEXT: v_readfirstlane_b32 s11, v2
+; SI-NEXT: v_readfirstlane_b32 s12, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, s5, v2
; SI-NEXT: s_add_u32 s5, s11, s5
; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v0
; SI-NEXT: s_addc_u32 s10, 0, s10
@@ -540,31 +540,31 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s0, s3
-; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX9-NEXT: s_add_u32 s9, s8, s7
+; GFX9-NEXT: s_mul_i32 s8, s0, s3
+; GFX9-NEXT: s_mul_hi_u32 s9, s0, s2
+; GFX9-NEXT: s_mul_hi_u32 s7, s0, s3
+; GFX9-NEXT: s_add_u32 s10, s9, s8
; GFX9-NEXT: s_mul_i32 s6, s1, s2
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT: s_add_u32 s9, s9, s6
-; GFX9-NEXT: s_mul_hi_i32 s10, s1, s3
-; GFX9-NEXT: s_addc_u32 s4, s5, s4
-; GFX9-NEXT: s_addc_u32 s5, s10, 0
-; GFX9-NEXT: s_mul_i32 s9, s1, s3
-; GFX9-NEXT: s_add_u32 s4, s4, s9
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_sub_u32 s9, s4, s2
-; GFX9-NEXT: s_subb_u32 s10, s5, 0
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s2
+; GFX9-NEXT: s_add_u32 s10, s10, s6
+; GFX9-NEXT: s_mul_hi_i32 s4, s1, s3
+; GFX9-NEXT: s_addc_u32 s5, s7, s5
+; GFX9-NEXT: s_addc_u32 s4, s4, 0
+; GFX9-NEXT: s_mul_i32 s7, s1, s3
+; GFX9-NEXT: s_add_u32 s5, s5, s7
+; GFX9-NEXT: s_addc_u32 s4, 0, s4
+; GFX9-NEXT: s_sub_u32 s7, s5, s2
+; GFX9-NEXT: s_subb_u32 s10, s4, 0
; GFX9-NEXT: s_cmp_lt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
-; GFX9-NEXT: s_cselect_b32 s1, s10, s5
-; GFX9-NEXT: s_sub_u32 s9, s4, s0
+; GFX9-NEXT: s_cselect_b32 s1, s10, s4
+; GFX9-NEXT: s_cselect_b32 s4, s7, s5
+; GFX9-NEXT: s_sub_u32 s7, s4, s0
; GFX9-NEXT: s_subb_u32 s5, s1, 0
; GFX9-NEXT: s_cmp_lt_i32 s3, 0
; GFX9-NEXT: s_cselect_b32 s5, s5, s1
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
-; GFX9-NEXT: s_add_i32 s1, s8, s7
+; GFX9-NEXT: s_cselect_b32 s4, s7, s4
+; GFX9-NEXT: s_add_i32 s1, s9, s8
; GFX9-NEXT: s_add_i32 s1, s1, s6
; GFX9-NEXT: s_ashr_i32 s6, s1, 31
; GFX9-NEXT: s_mov_b32 s7, s6
@@ -581,33 +581,33 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s7, s0, s3
-; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT: s_mul_i32 s6, s1, s2
-; GFX10-NEXT: s_add_u32 s11, s8, s7
-; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_mul_hi_i32 s9, s1, s3
-; GFX10-NEXT: s_add_u32 s11, s11, s6
+; GFX10-NEXT: s_mul_hi_u32 s6, s0, s2
+; GFX10-NEXT: s_mul_i32 s9, s0, s3
+; GFX10-NEXT: s_mul_hi_u32 s8, s0, s3
+; GFX10-NEXT: s_mul_i32 s7, s1, s2
+; GFX10-NEXT: s_add_u32 s11, s6, s9
+; GFX10-NEXT: s_mul_hi_u32 s5, s1, s2
+; GFX10-NEXT: s_addc_u32 s8, 0, s8
+; GFX10-NEXT: s_mul_hi_i32 s4, s1, s3
+; GFX10-NEXT: s_add_u32 s11, s11, s7
; GFX10-NEXT: s_mul_i32 s10, s1, s3
-; GFX10-NEXT: s_addc_u32 s4, s5, s4
-; GFX10-NEXT: s_addc_u32 s5, s9, 0
-; GFX10-NEXT: s_add_u32 s4, s4, s10
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_sub_u32 s9, s4, s2
-; GFX10-NEXT: s_subb_u32 s10, s5, 0
+; GFX10-NEXT: s_addc_u32 s5, s8, s5
+; GFX10-NEXT: s_addc_u32 s4, s4, 0
+; GFX10-NEXT: s_add_u32 s5, s5, s10
+; GFX10-NEXT: s_addc_u32 s4, 0, s4
+; GFX10-NEXT: s_sub_u32 s8, s5, s2
+; GFX10-NEXT: s_subb_u32 s10, s4, 0
; GFX10-NEXT: s_cmp_lt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s1, s9, s4
-; GFX10-NEXT: s_cselect_b32 s4, s10, s5
-; GFX10-NEXT: s_sub_u32 s9, s1, s0
+; GFX10-NEXT: s_cselect_b32 s1, s8, s5
+; GFX10-NEXT: s_cselect_b32 s4, s10, s4
+; GFX10-NEXT: s_sub_u32 s8, s1, s0
; GFX10-NEXT: s_subb_u32 s5, s4, 0
; GFX10-NEXT: s_cmp_lt_i32 s3, 0
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: s_cselect_b32 s5, s5, s4
-; GFX10-NEXT: s_cselect_b32 s4, s9, s1
-; GFX10-NEXT: s_add_i32 s1, s8, s7
-; GFX10-NEXT: s_add_i32 s1, s1, s6
+; GFX10-NEXT: s_cselect_b32 s4, s8, s1
+; GFX10-NEXT: s_add_i32 s1, s6, s9
+; GFX10-NEXT: s_add_i32 s1, s1, s7
; GFX10-NEXT: s_ashr_i32 s6, s1, 31
; GFX10-NEXT: s_mov_b32 s7, s6
; GFX10-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
@@ -622,34 +622,34 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s7, s0, s3
-; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX11-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX11-NEXT: s_mul_i32 s6, s1, s2
-; GFX11-NEXT: s_add_u32 s11, s8, s7
-; GFX11-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_mul_hi_i32 s9, s1, s3
-; GFX11-NEXT: s_add_u32 s11, s11, s6
+; GFX11-NEXT: s_mul_hi_u32 s6, s0, s2
+; GFX11-NEXT: s_mul_i32 s9, s0, s3
+; GFX11-NEXT: s_mul_hi_u32 s8, s0, s3
+; GFX11-NEXT: s_mul_i32 s7, s1, s2
+; GFX11-NEXT: s_add_u32 s11, s6, s9
+; GFX11-NEXT: s_mul_hi_u32 s5, s1, s2
+; GFX11-NEXT: s_addc_u32 s8, 0, s8
+; GFX11-NEXT: s_mul_hi_i32 s4, s1, s3
+; GFX11-NEXT: s_add_u32 s11, s11, s7
; GFX11-NEXT: s_mul_i32 s10, s1, s3
-; GFX11-NEXT: s_addc_u32 s4, s5, s4
-; GFX11-NEXT: s_addc_u32 s5, s9, 0
-; GFX11-NEXT: s_add_u32 s4, s4, s10
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_sub_u32 s9, s4, s2
-; GFX11-NEXT: s_subb_u32 s10, s5, 0
+; GFX11-NEXT: s_addc_u32 s5, s8, s5
+; GFX11-NEXT: s_addc_u32 s4, s4, 0
+; GFX11-NEXT: s_add_u32 s5, s5, s10
+; GFX11-NEXT: s_addc_u32 s4, 0, s4
+; GFX11-NEXT: s_sub_u32 s8, s5, s2
+; GFX11-NEXT: s_subb_u32 s10, s4, 0
; GFX11-NEXT: s_cmp_lt_i32 s1, 0
-; GFX11-NEXT: s_cselect_b32 s1, s9, s4
-; GFX11-NEXT: s_cselect_b32 s4, s10, s5
-; GFX11-NEXT: s_sub_u32 s9, s1, s0
+; GFX11-NEXT: s_cselect_b32 s1, s8, s5
+; GFX11-NEXT: s_cselect_b32 s4, s10, s4
+; GFX11-NEXT: s_sub_u32 s8, s1, s0
; GFX11-NEXT: s_subb_u32 s5, s4, 0
; GFX11-NEXT: s_cmp_lt_i32 s3, 0
; GFX11-NEXT: s_mul_i32 s0, s0, s2
; GFX11-NEXT: s_cselect_b32 s5, s5, s4
-; GFX11-NEXT: s_cselect_b32 s4, s9, s1
-; GFX11-NEXT: s_add_i32 s1, s8, s7
+; GFX11-NEXT: s_cselect_b32 s4, s8, s1
+; GFX11-NEXT: s_add_i32 s1, s6, s9
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s1, s1, s6
+; GFX11-NEXT: s_add_i32 s1, s1, s7
; GFX11-NEXT: s_ashr_i32 s6, s1, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s7, s6
@@ -666,17 +666,17 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2
; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3
; GFX12-NEXT: s_mul_i32 s6, s0, s3
-; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2
-; GFX12-NEXT: s_mul_i32 s10, s1, s2
+; GFX12-NEXT: s_mul_i32 s13, s1, s2
; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX12-NEXT: s_mul_hi_u32 s9, s1, s2
-; GFX12-NEXT: s_mul_hi_i32 s11, s1, s3
-; GFX12-NEXT: s_add_co_u32 s4, s6, s10
-; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s9
+; GFX12-NEXT: s_mul_hi_u32 s12, s1, s2
+; GFX12-NEXT: s_mul_hi_i32 s9, s1, s3
+; GFX12-NEXT: s_add_co_u32 s4, s6, s13
+; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s12
; GFX12-NEXT: s_mul_i32 s8, s1, s3
-; GFX12-NEXT: s_add_co_ci_u32 s9, s11, 0
+; GFX12-NEXT: s_add_co_ci_u32 s9, s9, 0
; GFX12-NEXT: s_cmp_lt_i32 s1, 0
; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[8:9]
; GFX12-NEXT: s_mov_b32 s4, s2
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 3d9c2a29cb9c1..2292105c14bc5 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -463,41 +463,39 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a,
define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 {
; SI-LABEL: test_smul24_i33:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_load_dword s6, s[4:5], 0xd
+; SI-NEXT: s_load_dword s4, s[4:5], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_bfe_i32 s0, s8, 0x180000
-; SI-NEXT: s_bfe_i32 s1, s2, 0x180000
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: s_mul_i32 s0, s1, s0
-; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s1, v0
-; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: s_bfe_i32 s5, s6, 0x180000
+; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: s_mul_i32 s5, s4, s5
+; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0
+; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_load_dword s3, s[4:5], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s2, s2, 0x180000
-; VI-NEXT: s_bfe_i32 s3, s4, 0x180000
+; VI-NEXT: s_bfe_i32 s3, s3, 0x180000
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i33:
@@ -576,32 +574,30 @@ entry:
define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
; SI-LABEL: test_smulhi24_i33:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_load_dword s6, s[4:5], 0xd
+; SI-NEXT: s_load_dword s7, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v0, s8
-; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smulhi24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dword s6, s[4:5], 0x34
+; VI-NEXT: s_load_dword s7, s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i33:
diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll
index f497752994852..1d878a02d2525 100644
--- a/llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -846,8 +846,7 @@ define i64 @poison_should_freeze(i1 %cond1, i32 %val, i16 %val2, i64 %a, i64 %b)
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mov_b32_e32 v7, 0x5040100
-; GCN-NEXT: v_perm_b32 v2, v2, s4, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
index 8d0e00383d692..dcd7ed441fbae 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
@@ -3967,8 +3967,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5]
-; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -4071,8 +4071,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5]
-; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -4179,8 +4179,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5]
; GFX9-SDAG-NEXT: s_nop 1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
; GFX9-SDAG-NEXT: s_nop 1
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -4287,8 +4287,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[6:7]
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4
; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4391,8 +4391,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[6:7]
; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -4506,8 +4506,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) {
; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
index f15ecf014ab0b..515d36f9967a8 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
@@ -3966,8 +3966,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5]
-; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -4070,8 +4070,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5]
-; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -4178,8 +4178,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5]
; GFX9-SDAG-NEXT: s_nop 1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3]
; GFX9-SDAG-NEXT: s_nop 1
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -4286,8 +4286,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[6:7]
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4
; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4390,8 +4390,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[6:7]
; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3]
; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -4505,8 +4505,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) {
; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
index e62165cb933c5..fba4bd516183c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
@@ -3843,8 +3843,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -3947,8 +3947,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -4055,8 +4055,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX9-SDAG-NEXT: s_nop 1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-SDAG-NEXT: s_nop 1
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -4163,8 +4163,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[6:7]
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4
; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4267,8 +4267,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[6:7]
; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -4382,8 +4382,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) {
; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
index 83ecaaa7e0846..6ffff5968d4e0 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
@@ -3579,8 +3579,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -3683,8 +3683,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -3791,8 +3791,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5]
; GFX9-SDAG-NEXT: s_nop 1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-SDAG-NEXT: s_nop 1
; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -3899,8 +3899,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7]
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4
; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4003,8 +4003,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7]
; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -4118,8 +4118,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) {
; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0
; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
diff --git a/llvm/test/CodeGen/RISCV/pr66603.ll b/llvm/test/CodeGen/RISCV/pr66603.ll
index cfe8ceed12582..eb3d1a3b916e2 100644
--- a/llvm/test/CodeGen/RISCV/pr66603.ll
+++ b/llvm/test/CodeGen/RISCV/pr66603.ll
@@ -7,15 +7,11 @@ define i32 @PR66603(double %x) nounwind {
; RV32-LABEL: PR66603:
; RV32: # %bb.0:
; RV32-NEXT: fcvt.w.d a0, fa0, rtz
-; RV32-NEXT: slli a0, a0, 24
-; RV32-NEXT: srai a0, a0, 24
; RV32-NEXT: ret
;
; RV64-LABEL: PR66603:
; RV64: # %bb.0:
; RV64-NEXT: fcvt.l.d a0, fa0, rtz
-; RV64-NEXT: slli a0, a0, 56
-; RV64-NEXT: srai a0, a0, 56
; RV64-NEXT: ret
%as_i8 = fptosi double %x to i8
%frozen_i8 = freeze i8 %as_i8
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 8a6a30318ae58..d18b64f12e527 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -74,40 +74,8 @@ define dso_local void @m() local_unnamed_addr #1 {
; CHECK-NEXT: rosbg %r2, %r3, 32, 61, 2
; CHECK-NEXT: rosbg %r2, %r5, 32, 62, 1
; CHECK-NEXT: or %r2, %r14
-; CHECK-NEXT: vlgvb %r4, %v0, 1
-; CHECK-NEXT: vlgvb %r3, %v0, 0
-; CHECK-NEXT: risbg %r3, %r3, 48, 176, 15
-; CHECK-NEXT: rosbg %r3, %r4, 49, 49, 14
-; CHECK-NEXT: vlgvb %r4, %v0, 2
-; CHECK-NEXT: rosbg %r3, %r4, 50, 50, 13
-; CHECK-NEXT: vlgvb %r4, %v0, 3
-; CHECK-NEXT: rosbg %r3, %r4, 51, 51, 12
-; CHECK-NEXT: vlgvb %r4, %v0, 4
-; CHECK-NEXT: rosbg %r3, %r4, 52, 52, 11
-; CHECK-NEXT: vlgvb %r4, %v0, 5
-; CHECK-NEXT: rosbg %r3, %r4, 53, 53, 10
-; CHECK-NEXT: vlgvb %r4, %v0, 6
-; CHECK-NEXT: rosbg %r3, %r4, 54, 54, 9
-; CHECK-NEXT: vlgvb %r4, %v0, 7
-; CHECK-NEXT: rosbg %r3, %r4, 55, 55, 8
-; CHECK-NEXT: vlgvb %r4, %v0, 8
-; CHECK-NEXT: rosbg %r3, %r4, 56, 56, 7
-; CHECK-NEXT: vlgvb %r4, %v0, 9
-; CHECK-NEXT: rosbg %r3, %r4, 57, 57, 6
-; CHECK-NEXT: vlgvb %r4, %v0, 10
-; CHECK-NEXT: rosbg %r3, %r4, 58, 58, 5
-; CHECK-NEXT: vlgvb %r4, %v0, 11
-; CHECK-NEXT: rosbg %r3, %r4, 59, 59, 4
-; CHECK-NEXT: vlgvb %r4, %v0, 12
-; CHECK-NEXT: rosbg %r3, %r4, 60, 60, 3
-; CHECK-NEXT: vlgvb %r4, %v0, 13
-; CHECK-NEXT: rosbg %r3, %r4, 61, 61, 2
-; CHECK-NEXT: vlgvb %r4, %v0, 14
-; CHECK-NEXT: rosbg %r3, %r4, 62, 62, 1
-; CHECK-NEXT: vlgvb %r4, %v0, 15
-; CHECK-NEXT: rosbg %r3, %r4, 63, 63, 0
; CHECK-NEXT: xilf %r3, 4294967295
-; CHECK-NEXT: or %r3, %r2
+; CHECK-NEXT: rosbg %r3, %r2, 48, 63, 0
; CHECK-NEXT: tmll %r3, 65535
; CHECK-NEXT: ipm %r2
; CHECK-NEXT: afi %r2, -268435456
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
index 080c6c1a1efdc..27bc1e76a7ee2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
@@ -2902,7 +2902,7 @@ define arm_aapcs_vfpcc <8 x half> @faddqr_v8f16_y(<8 x half> %x, half %y, i32 %n
; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vdup.16 q1, r1
; CHECK-NEXT: vpst
-; CHECK-NEXT: vaddt.f16 q1, q0, r1
+; CHECK-NEXT: vaddt.f16 q1, q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -2978,7 +2978,7 @@ define arm_aapcs_vfpcc <8 x half> @fmulqr_v8f16_y(<8 x half> %x, half %y, i32 %n
; CHECK-NEXT: vctp.16 r0
; CHECK-NEXT: vdup.16 q1, r1
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmult.f16 q1, q0, r1
+; CHECK-NEXT: vmult.f16 q1, q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 03f283a57a217..c60d9a3ff17d3 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -6,8 +6,7 @@
define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_8x8mem_to_8x16:
; KNL: # %bb.0:
-; KNL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; KNL-NEXT: vpsllw $15, %xmm0, %xmm0
; KNL-NEXT: vpsraw $15, %xmm0, %xmm0
; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -22,8 +21,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone
;
; AVX512DQNOBW-LABEL: zext_8x8mem_to_8x16:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX512DQNOBW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512DQNOBW-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX512DQNOBW-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -37,8 +35,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone
define <8 x i16> @sext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_8x8mem_to_8x16:
; KNL: # %bb.0:
-; KNL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; KNL-NEXT: vpmovsxbw %xmm1, %xmm1
+; KNL-NEXT: vpmovsxbw (%rdi), %xmm1
; KNL-NEXT: vpsllw $15, %xmm0, %xmm0
; KNL-NEXT: vpsraw $15, %xmm0, %xmm0
; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -53,8 +50,7 @@ define <8 x i16> @sext_8x8mem_to_8x16(ptr%i , <8 x i1> %mask) nounwind readnone
;
; AVX512DQNOBW-LABEL: sext_8x8mem_to_8x16:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQNOBW-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %xmm1
; AVX512DQNOBW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512DQNOBW-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX512DQNOBW-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -214,7 +210,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -237,7 +233,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -257,9 +253,9 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vpmovsxbw (%rdi), %ymm2
-; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm3
-; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm2
+; KNL-NEXT: vpmovsxbw (%rdi), %ymm3
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -280,9 +276,9 @@ define <32 x i16> @sext_32x8mem_to_32x16(ptr%i , <32 x i1> %mask) nounwind readn
; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm2
-; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm3
-; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm2
+; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm3
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index e223765eb887b..7e3a902044615 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -864,12 +864,11 @@ define i32 @freeze_ssubo(i32 %a0, i32 %a1, i8 %a2, i8 %a3) nounwind {
; X86-LABEL: freeze_ssubo:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: addb {{[0-9]+}}(%esp), %dl
-; X86-NEXT: setb %cl
-; X86-NEXT: andl $1, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: addb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: setb %dl
+; X86-NEXT: subl %edx, %eax
; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
@@ -896,12 +895,11 @@ define i32 @freeze_usubo(i32 %a0, i32 %a1, i8 %a2, i8 %a3) nounwind {
; X86-LABEL: freeze_usubo:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: addb {{[0-9]+}}(%esp), %dl
-; X86-NEXT: setb %cl
-; X86-NEXT: andl $1, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: addb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: setb %dl
+; X86-NEXT: subl %edx, %eax
; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll
index 38e3e23f7caac..a5549be92e793 100644
--- a/llvm/test/CodeGen/X86/freeze.ll
+++ b/llvm/test/CodeGen/X86/freeze.ll
@@ -96,8 +96,6 @@ define i32 @freeze_anonstruct() {
define i32 @freeze_anonstruct2() {
; X86ASM-LABEL: freeze_anonstruct2:
; X86ASM: # %bb.0:
-; X86ASM-NEXT: movzwl %ax, %eax
-; X86ASM-NEXT: addl %eax, %eax
; X86ASM-NEXT: retq
%y1 = freeze {i32, i16} undef
%v1 = extractvalue {i32, i16} %y1, 0
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index a4750b4cd4ad0..b1237b31660c2 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -863,20 +863,18 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
+; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpsubb %ymm0, %ymm5, %ymm5
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm4 & (zmm1 ^ zmm0))
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
+; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg:
@@ -895,20 +893,18 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
+; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm4 & (zmm1 ^ zmm0))
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_reg:
@@ -953,19 +949,17 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5))
+; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: vpsubb %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm4 & (zmm3 ^ zmm2))
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
@@ -985,19 +979,17 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm7, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm4 & (zmm2 ^ zmm5))
+; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm5, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
+; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm4 & (zmm3 ^ zmm2))
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm3, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 2f8cd4d41af54..c9ef6b6c4cdb2 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -4451,8 +4451,8 @@ define i32 @PR39665_c_ray_select(<2 x double> %x, <2 x double> %y) {
; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k0
-; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: notb %al
; KNL-NEXT: testb $3, %al
; KNL-NEXT: movl $42, %ecx
; KNL-NEXT: movl $99, %eax
@@ -4463,8 +4463,8 @@ define i32 @PR39665_c_ray_select(<2 x double> %x, <2 x double> %y) {
; SKX-LABEL: PR39665_c_ray_select:
; SKX: # %bb.0:
; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0
-; SKX-NEXT: knotw %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: notb %al
; SKX-NEXT: testb $3, %al
; SKX-NEXT: movl $42, %ecx
; SKX-NEXT: movl $99, %eax
diff --git a/llvm/test/CodeGen/X86/pr162812.ll b/llvm/test/CodeGen/X86/pr162812.ll
index cec093c3df743..02703b7e32cc6 100644
--- a/llvm/test/CodeGen/X86/pr162812.ll
+++ b/llvm/test/CodeGen/X86/pr162812.ll
@@ -34,32 +34,47 @@ define <32 x i8> @PR162812(<32 x i8> %a, <32 x i8> %mask) {
;
; SSE42-LABEL: PR162812:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa %xmm0, %xmm4
-; SSE42-NEXT: psrlw $2, %xmm2
-; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [8224,8224,8224,8224,8224,8224,8224,8224]
-; SSE42-NEXT: pand %xmm5, %xmm2
-; SSE42-NEXT: paddb %xmm2, %xmm2
-; SSE42-NEXT: paddb %xmm2, %xmm2
+; SSE42-NEXT: movdqa %xmm2, %xmm5
+; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm0, %xmm6
-; SSE42-NEXT: paddb %xmm0, %xmm6
-; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm4
-; SSE42-NEXT: psrlw $2, %xmm3
-; SSE42-NEXT: pand %xmm3, %xmm5
+; SSE42-NEXT: psllw $4, %xmm6
+; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE42-NEXT: pand %xmm7, %xmm6
+; SSE42-NEXT: psrlw $2, %xmm5
+; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [8224,8224,8224,8224,8224,8224,8224,8224]
+; SSE42-NEXT: pand %xmm4, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2
+; SSE42-NEXT: movdqa %xmm2, %xmm6
+; SSE42-NEXT: paddb %xmm2, %xmm6
; SSE42-NEXT: paddb %xmm5, %xmm5
; SSE42-NEXT: paddb %xmm5, %xmm5
-; SSE42-NEXT: movdqa %xmm1, %xmm2
-; SSE42-NEXT: paddb %xmm1, %xmm2
; SSE42-NEXT: movdqa %xmm5, %xmm0
-; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2
+; SSE42-NEXT: movdqa %xmm1, %xmm5
+; SSE42-NEXT: psllw $4, %xmm5
+; SSE42-NEXT: pand %xmm7, %xmm5
+; SSE42-NEXT: psrlw $2, %xmm3
+; SSE42-NEXT: pand %xmm3, %xmm4
+; SSE42-NEXT: movdqa %xmm4, %xmm0
+; SSE42-NEXT: pblendvb %xmm0, %xmm5, %xmm1
+; SSE42-NEXT: movdqa %xmm1, %xmm3
+; SSE42-NEXT: paddb %xmm1, %xmm3
+; SSE42-NEXT: paddb %xmm4, %xmm4
+; SSE42-NEXT: paddb %xmm4, %xmm4
; SSE42-NEXT: movdqa %xmm4, %xmm0
+; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1
+; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: PR162812:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
@@ -67,9 +82,12 @@ define <32 x i8> @PR162812(<32 x i8> %a, <32 x i8> %mask) {
;
; AVX512-LABEL: PR162812:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
; AVX512-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
+; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
index b8e83da9cf361..ebb5e135eacd0 100644
--- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
@@ -281,7 +281,7 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-AVX2-NEXT: vpsllvd %ymm1, %ymm2, %ymm2
-; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; X64-AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; X64-AVX2-NEXT: vpsrlvd %ymm1, %ymm3, %ymm1
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index fce879585289a..ebd57cb941552 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -501,39 +501,39 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
; SSE3-NEXT: pextrw $0, %xmm1, %eax
; SSE3-NEXT: pextrw $1, %xmm1, %ecx
; SSE3-NEXT: pextrw $2, %xmm1, %edx
-; SSE3-NEXT: pextrw $3, %xmm1, %edi
-; SSE3-NEXT: pextrw $4, %xmm1, %r8d
-; SSE3-NEXT: pextrw $5, %xmm1, %r9d
-; SSE3-NEXT: pextrw $6, %xmm1, %r10d
-; SSE3-NEXT: pextrw $7, %xmm1, %esi
+; SSE3-NEXT: pextrw $3, %xmm1, %esi
+; SSE3-NEXT: pextrw $4, %xmm1, %edi
+; SSE3-NEXT: pextrw $5, %xmm1, %r8d
+; SSE3-NEXT: pextrw $6, %xmm1, %r9d
+; SSE3-NEXT: pextrw $7, %xmm1, %r10d
; SSE3-NEXT: movdqa %xmm2, -24(%rsp)
; SSE3-NEXT: andl $7, %eax
-; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
; SSE3-NEXT: andl $7, %ecx
-; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
; SSE3-NEXT: andl $7, %edx
-; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx
+; SSE3-NEXT: andl $7, %esi
; SSE3-NEXT: andl $7, %edi
-; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi
; SSE3-NEXT: andl $7, %r8d
-; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d
; SSE3-NEXT: andl $7, %r9d
-; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d
; SSE3-NEXT: andl $7, %r10d
; SSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d
-; SSE3-NEXT: andl $7, %esi
-; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
-; SSE3-NEXT: movd %esi, %xmm1
-; SSE3-NEXT: movd %r10d, %xmm2
+; SSE3-NEXT: movd %r10d, %xmm1
+; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d
+; SSE3-NEXT: movd %r9d, %xmm2
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE3-NEXT: movd %r9d, %xmm1
-; SSE3-NEXT: movd %r8d, %xmm3
+; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d
+; SSE3-NEXT: movd %r8d, %xmm1
+; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSE3-NEXT: movd %edi, %xmm3
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE3-NEXT: movd %edi, %xmm1
+; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; SSE3-NEXT: movd %esi, %xmm1
+; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx
; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
; SSE3-NEXT: movd %ecx, %xmm1
+; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
; SSE3-NEXT: movd %eax, %xmm4
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
@@ -1102,9 +1102,8 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices)
; SSE3-NEXT: movq %xmm1, %rcx
; SSE3-NEXT: andl $1, %ecx
; SSE3-NEXT: movaps %xmm0, -24(%rsp)
-; SSE3-NEXT: movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
-; SSE3-NEXT: movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
; SSE3-NEXT: pandn %xmm0, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: retq
@@ -1127,9 +1126,8 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices)
; SSSE3-NEXT: movq %xmm1, %rcx
; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movaps %xmm0, -24(%rsp)
-; SSSE3-NEXT: movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
-; SSSE3-NEXT: movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
; SSSE3-NEXT: pandn %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
@@ -1302,16 +1300,16 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
; SSE3-NEXT: movd %xmm1, %esi
; SSE3-NEXT: movaps %xmm2, -24(%rsp)
; SSE3-NEXT: andl $3, %eax
-; SSE3-NEXT: movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: andl $3, %ecx
-; SSE3-NEXT: movd -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT: andl $3, %edx
-; SSE3-NEXT: movd -24(%rsp,%rdx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
; SSE3-NEXT: andl $3, %esi
-; SSE3-NEXT: movd -24(%rsp,%rsi,4), %xmm4 # xmm4 = mem[0],zero,zero,zero
-; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE3-NEXT: movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE3-NEXT: movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT: pandn %xmm1, %xmm0
; SSE3-NEXT: retq
;
@@ -1329,8 +1327,9 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSSE3-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_zero_v4f32:
@@ -1341,8 +1340,9 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [67372036,67372036,67372036,67372036]
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
+; SSE41-NEXT: pandn %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; XOP-LABEL: var_shuffle_zero_v4f32:
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index 1a6351524ffbd..5af992c2d05dd 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -289,7 +289,8 @@ define <8 x i32> @test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32>
; AVX2-NEXT: cmpq $8, %r11
; AVX2-NEXT: cmovbl (%rsp,%rax,4), %ebx
; AVX2-NEXT: vmovss %xmm0, (%rsp)
-; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4)
+; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4)
; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rsi,4)
; AVX2-NEXT: andl $7, %edi
@@ -363,7 +364,8 @@ define <8 x float> @test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x f
; AVX2-NEXT: vmovss %xmm0, (%rsp)
; AVX2-NEXT: vmovd %xmm3, %eax
; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4)
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT: vpextrd $1, %xmm3, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
@@ -1093,15 +1095,15 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $1, %xmm1, %r13d
+; AVX2-NEXT: vpextrb $1, %xmm1, %ebp
; AVX2-NEXT: vmovd %xmm1, %esi
; AVX2-NEXT: movl %esi, %eax
; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: subb %r13b, %al
+; AVX2-NEXT: subb %bpl, %al
; AVX2-NEXT: vpextrb $2, %xmm1, %edx
; AVX2-NEXT: subb %dl, %al
-; AVX2-NEXT: vpextrb $3, %xmm1, %ebp
-; AVX2-NEXT: subb %bpl, %al
+; AVX2-NEXT: vpextrb $3, %xmm1, %r13d
+; AVX2-NEXT: subb %r13b, %al
; AVX2-NEXT: vpextrb $4, %xmm1, %r12d
; AVX2-NEXT: subb %r12b, %al
; AVX2-NEXT: vpextrb $5, %xmm1, %r15d
@@ -1135,17 +1137,17 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8>
; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: andl $1, %esi
; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rsi)
-; AVX2-NEXT: andl $1, %r13d
-; AVX2-NEXT: addq %rsi, %r13
-; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%r13)
+; AVX2-NEXT: andl $1, %ebp
+; AVX2-NEXT: addq %rsi, %rbp
+; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%rbp)
; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %r13, %rdx
+; AVX2-NEXT: addq %rbp, %rdx
; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rdx)
-; AVX2-NEXT: andl $1, %ebp
-; AVX2-NEXT: addq %rdx, %rbp
-; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%rbp)
+; AVX2-NEXT: andl $1, %r13d
+; AVX2-NEXT: addq %rdx, %r13
+; AVX2-NEXT: vpextrb $4, %xmm0, -40(%rsp,%r13)
; AVX2-NEXT: andl $1, %r12d
-; AVX2-NEXT: addq %rbp, %r12
+; AVX2-NEXT: addq %r13, %r12
; AVX2-NEXT: andl $1, %r15d
; AVX2-NEXT: addq %r12, %r15
; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12
@@ -1693,30 +1695,30 @@ define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x
; AVX2-NEXT: vpextrw $4, %xmm1, %r13d
; AVX2-NEXT: andl $1, %r13d
; AVX2-NEXT: addq %r12, %r13
-; AVX2-NEXT: vpextrw $5, %xmm1, %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: addq %r13, %rdx
-; AVX2-NEXT: vpextrw $6, %xmm1, %ecx
+; AVX2-NEXT: vpextrw $5, %xmm1, %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vpextrw $7, %xmm1, %edi
-; AVX2-NEXT: andl $1, %edi
-; AVX2-NEXT: addq %rcx, %rdi
+; AVX2-NEXT: addq %r13, %rcx
+; AVX2-NEXT: vpextrw $6, %xmm1, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: vpextrw $7, %xmm1, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: cmpq $16, %rdi
-; AVX2-NEXT: vpextrw $7, %xmm1, %eax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT: cmovbw (%rsp,%rsi,2), %ax
-; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: cmpq $16, %rdx
+; AVX2-NEXT: vpextrw $7, %xmm1, %esi
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX2-NEXT: cmovbw (%rsp,%rdi,2), %si
+; AVX2-NEXT: movl %esi, %edi
; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
; AVX2-NEXT: vpextrw $1, %xmm0, (%rsp,%rsi,2)
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX2-NEXT: vpextrw $2, %xmm0, (%rsp,%rsi,2)
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX2-NEXT: vpextrw $3, %xmm0, (%rsp,%rsi,2)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: vpextrw $4, %xmm0, (%rsp,%rax,2)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT: vpextrw $4, %xmm0, (%rsp,%rsi,2)
; AVX2-NEXT: andl $15, %r8d
; AVX2-NEXT: vpextrw $5, %xmm0, (%rsp,%r8,2)
; AVX2-NEXT: andl $15, %r9d
@@ -1735,16 +1737,15 @@ define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x
; AVX2-NEXT: vpextrw $4, %xmm1, (%rsp,%r12,2)
; AVX2-NEXT: andl $15, %r13d
; AVX2-NEXT: vpextrw $5, %xmm1, (%rsp,%r13,2)
-; AVX2-NEXT: andl $15, %edx
-; AVX2-NEXT: vpextrw $6, %xmm1, (%rsp,%rdx,2)
; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpextrw $7, %xmm1, (%rsp,%rcx,2)
-; AVX2-NEXT: cmpq $15, %rdi
+; AVX2-NEXT: vpextrw $6, %xmm1, (%rsp,%rcx,2)
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpextrw $7, %xmm1, (%rsp,%rax,2)
+; AVX2-NEXT: cmpq $15, %rdx
; AVX2-NEXT: movl $15, %eax
-; AVX2-NEXT: cmovbq %rdi, %rax
+; AVX2-NEXT: cmovbq %rdx, %rax
; AVX2-NEXT: movl %eax, %eax
-; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; AVX2-NEXT: movw %cx, (%rsp,%rax,2)
+; AVX2-NEXT: movw %di, (%rsp,%rax,2)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: leaq -40(%rbp), %rsp
; AVX2-NEXT: popq %rbx
@@ -1788,135 +1789,141 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $96, %rsp
-; AVX2-NEXT: movl %r9d, %r11d
-; AVX2-NEXT: movl %r8d, %r10d
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: movl %edx, %r8d
+; AVX2-NEXT: subq $160, %rsp
+; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8
+; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: movzbl 360(%rbp), %eax
-; AVX2-NEXT: movzbl 352(%rbp), %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm4
+; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl 360(%rbp), %eax
+; AVX2-NEXT: movl 352(%rbp), %r10d
+; AVX2-NEXT: vmovd %r10d, %xmm4
; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 368(%rbp), %eax
+; AVX2-NEXT: movl 368(%rbp), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 376(%rbp), %eax
+; AVX2-NEXT: movl 376(%rbp), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 384(%rbp), %eax
+; AVX2-NEXT: movl 384(%rbp), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 392(%rbp), %eax
+; AVX2-NEXT: movl 392(%rbp), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 400(%rbp), %eax
+; AVX2-NEXT: movl 400(%rbp), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 408(%rbp), %eax
+; AVX2-NEXT: movl 408(%rbp), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 416(%rbp), %eax
+; AVX2-NEXT: movl 416(%rbp), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 424(%rbp), %eax
+; AVX2-NEXT: movl 424(%rbp), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 432(%rbp), %eax
+; AVX2-NEXT: movl 432(%rbp), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 440(%rbp), %eax
+; AVX2-NEXT: movl 440(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 448(%rbp), %eax
+; AVX2-NEXT: movl 448(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 456(%rbp), %eax
+; AVX2-NEXT: movl 456(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 464(%rbp), %eax
+; AVX2-NEXT: movl 464(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 472(%rbp), %eax
+; AVX2-NEXT: movl 472(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; AVX2-NEXT: movzbl 224(%rbp), %eax
+; AVX2-NEXT: movl 224(%rbp), %eax
; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: movzbl 232(%rbp), %eax
+; AVX2-NEXT: movl 232(%rbp), %eax
; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 240(%rbp), %eax
+; AVX2-NEXT: movl 240(%rbp), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 248(%rbp), %eax
+; AVX2-NEXT: movl 248(%rbp), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 256(%rbp), %eax
+; AVX2-NEXT: movl 256(%rbp), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 264(%rbp), %eax
+; AVX2-NEXT: movl 264(%rbp), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 272(%rbp), %eax
+; AVX2-NEXT: movl 272(%rbp), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 280(%rbp), %eax
+; AVX2-NEXT: movl 280(%rbp), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 288(%rbp), %eax
+; AVX2-NEXT: movl 288(%rbp), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 296(%rbp), %eax
+; AVX2-NEXT: movl 296(%rbp), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 304(%rbp), %eax
+; AVX2-NEXT: movl 304(%rbp), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 312(%rbp), %eax
+; AVX2-NEXT: movl 312(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 320(%rbp), %eax
+; AVX2-NEXT: movl 320(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 328(%rbp), %eax
+; AVX2-NEXT: movl 328(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 336(%rbp), %eax
+; AVX2-NEXT: movl 336(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 344(%rbp), %eax
+; AVX2-NEXT: movl 344(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
; AVX2-NEXT: vmovd %edi, %xmm5
; AVX2-NEXT: vpinsrb $1, %esi, %xmm5, %xmm5
; AVX2-NEXT: vpinsrb $2, %edx, %xmm5, %xmm5
-; AVX2-NEXT: vpinsrb $3, %r9d, %xmm5, %xmm5
-; AVX2-NEXT: vpinsrb $4, %r10d, %xmm5, %xmm5
-; AVX2-NEXT: vpinsrb $5, %r11d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 16(%rbp), %ebx
-; AVX2-NEXT: vpinsrb $6, %ebx, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 24(%rbp), %r14d
-; AVX2-NEXT: vpinsrb $7, %r14d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 32(%rbp), %r15d
-; AVX2-NEXT: vpinsrb $8, %r15d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 40(%rbp), %r12d
-; AVX2-NEXT: vpinsrb $9, %r12d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 48(%rbp), %r13d
-; AVX2-NEXT: vpinsrb $10, %r13d, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 56(%rbp), %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 64(%rbp), %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 72(%rbp), %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 80(%rbp), %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 88(%rbp), %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX2-NEXT: movzbl 96(%rbp), %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: movzbl 104(%rbp), %eax
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $4, %r8d, %xmm5, %xmm5
+; AVX2-NEXT: vpinsrb $5, %r9d, %xmm5, %xmm5
+; AVX2-NEXT: movl 16(%rbp), %esi
+; AVX2-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5
+; AVX2-NEXT: movl 24(%rbp), %edi
+; AVX2-NEXT: vpinsrb $7, %edi, %xmm5, %xmm5
+; AVX2-NEXT: movl 32(%rbp), %r8d
+; AVX2-NEXT: vpinsrb $8, %r8d, %xmm5, %xmm5
+; AVX2-NEXT: movl 40(%rbp), %r9d
+; AVX2-NEXT: vpinsrb $9, %r9d, %xmm5, %xmm5
+; AVX2-NEXT: movl 48(%rbp), %r10d
+; AVX2-NEXT: vpinsrb $10, %r10d, %xmm5, %xmm5
+; AVX2-NEXT: movl 56(%rbp), %r11d
+; AVX2-NEXT: vpinsrb $11, %r11d, %xmm5, %xmm5
+; AVX2-NEXT: movl 64(%rbp), %ebx
+; AVX2-NEXT: vpinsrb $12, %ebx, %xmm5, %xmm5
+; AVX2-NEXT: movl 72(%rbp), %r14d
+; AVX2-NEXT: vpinsrb $13, %r14d, %xmm5, %xmm5
+; AVX2-NEXT: movl 80(%rbp), %r15d
+; AVX2-NEXT: vpinsrb $14, %r15d, %xmm5, %xmm5
+; AVX2-NEXT: movl 88(%rbp), %r12d
+; AVX2-NEXT: vpinsrb $15, %r12d, %xmm5, %xmm5
+; AVX2-NEXT: movl 96(%rbp), %r13d
+; AVX2-NEXT: vmovd %r13d, %xmm6
+; AVX2-NEXT: movl 104(%rbp), %eax
; AVX2-NEXT: vpinsrb $1, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 112(%rbp), %eax
+; AVX2-NEXT: movl 112(%rbp), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 120(%rbp), %eax
+; AVX2-NEXT: movl 120(%rbp), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 128(%rbp), %eax
+; AVX2-NEXT: movl 128(%rbp), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 136(%rbp), %eax
+; AVX2-NEXT: movl 136(%rbp), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 144(%rbp), %eax
+; AVX2-NEXT: movl 144(%rbp), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 152(%rbp), %eax
+; AVX2-NEXT: movl 152(%rbp), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 160(%rbp), %eax
+; AVX2-NEXT: movl 160(%rbp), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 168(%rbp), %eax
+; AVX2-NEXT: movl 168(%rbp), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 176(%rbp), %eax
+; AVX2-NEXT: movl 176(%rbp), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 184(%rbp), %eax
+; AVX2-NEXT: movl 184(%rbp), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 192(%rbp), %eax
+; AVX2-NEXT: movl 192(%rbp), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 200(%rbp), %eax
+; AVX2-NEXT: movl 200(%rbp), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 208(%rbp), %eax
+; AVX2-NEXT: movl 208(%rbp), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6
-; AVX2-NEXT: movzbl 216(%rbp), %eax
+; AVX2-NEXT: movl 216(%rbp), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6
; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
@@ -1960,435 +1967,382 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX2-NEXT: vmovaps %ymm2, (%rsp)
; AVX2-NEXT: movzbl %al, %eax
; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: movzbl (%rsp,%rax), %edx
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
+; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp)
-; AVX2-NEXT: andl $1, %edi
-; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rdi)
-; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: addq %rdi, %rsi
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rsi)
-; AVX2-NEXT: andl $1, %r8d
-; AVX2-NEXT: addq %rsi, %r8
-; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r8)
-; AVX2-NEXT: andl $1, %r9d
-; AVX2-NEXT: addq %r8, %r9
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r9)
-; AVX2-NEXT: andl $1, %r10d
-; AVX2-NEXT: addq %r9, %r10
-; AVX2-NEXT: movl %r10d, %eax
-; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: andl $1, %r11d
-; AVX2-NEXT: addq %r10, %r11
-; AVX2-NEXT: movzbl %bl, %eax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %r11, %rax
-; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11
-; AVX2-NEXT: andl $63, %r11d
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%r11)
-; AVX2-NEXT: movzbl %r14b, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl %r15b, %eax
+; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl %r12b, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl %r13b, %eax
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 56(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 64(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 72(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
-; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 80(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 88(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: addq %rax, %rsi
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 96(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: addq %rsi, %rdi
+; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi
+; AVX2-NEXT: andl $63, %esi
+; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rsi)
+; AVX2-NEXT: andl $1, %r8d
+; AVX2-NEXT: addq %rdi, %r8
+; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX2-NEXT: andl $63, %edi
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdi)
+; AVX2-NEXT: andl $1, %r9d
+; AVX2-NEXT: addq %r8, %r9
+; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8
+; AVX2-NEXT: andl $63, %r8d
+; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%r8)
+; AVX2-NEXT: andl $1, %r10d
+; AVX2-NEXT: addq %r9, %r10
+; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9
+; AVX2-NEXT: andl $63, %r9d
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%r9)
+; AVX2-NEXT: andl $1, %r11d
+; AVX2-NEXT: addq %r10, %r11
+; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10
+; AVX2-NEXT: andl $63, %r10d
+; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%r10)
+; AVX2-NEXT: andl $1, %ebx
+; AVX2-NEXT: addq %r11, %rbx
+; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11
+; AVX2-NEXT: andl $63, %r11d
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%r11)
+; AVX2-NEXT: andl $1, %r14d
+; AVX2-NEXT: addq %rbx, %r14
+; AVX2-NEXT: # kill: def $ebx killed $ebx killed $rbx def $rbx
+; AVX2-NEXT: andl $63, %ebx
+; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rbx)
+; AVX2-NEXT: andl $1, %r15d
+; AVX2-NEXT: addq %r14, %r15
+; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14
+; AVX2-NEXT: andl $63, %r14d
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%r14)
+; AVX2-NEXT: andl $1, %r12d
+; AVX2-NEXT: addq %r15, %r12
+; AVX2-NEXT: # kill: def $r15d killed $r15d killed $r15 def $r15
+; AVX2-NEXT: andl $63, %r15d
+; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%r15)
+; AVX2-NEXT: andl $1, %r13d
+; AVX2-NEXT: addq %r12, %r13
+; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12
+; AVX2-NEXT: andl $63, %r12d
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 104(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%r12)
+; AVX2-NEXT: movl 104(%rbp), %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: addq %r13, %rax
+; AVX2-NEXT: # kill: def $r13d killed $r13d killed $r13 def $r13
+; AVX2-NEXT: andl $63, %r13d
+; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%r13)
+; AVX2-NEXT: movl 112(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $63, %eax
-; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 112(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 120(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 120(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 128(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 136(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 128(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 136(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 144(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 152(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 144(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 152(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 160(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 168(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 160(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 168(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 176(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 184(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 176(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 184(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 192(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 200(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 192(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 200(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 208(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 216(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 208(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 216(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 224(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 232(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 224(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 232(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $1, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 240(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 248(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 240(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 248(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 256(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 264(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: movl 256(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 264(%rbp), %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 272(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 280(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 272(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 280(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 288(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 296(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 288(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 296(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 304(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 312(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 304(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 312(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 320(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 328(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 320(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 328(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 336(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 344(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 336(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rax)
+; AVX2-NEXT: movl 344(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm1, (%rsp,%rax)
-; AVX2-NEXT: movzbl 352(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: movl 352(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 360(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 360(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 368(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 376(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 368(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 376(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 384(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 392(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 384(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 392(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 400(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 408(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 400(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 408(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 416(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 424(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 416(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 424(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 432(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 440(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 432(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 440(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 448(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 456(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 448(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 456(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax)
-; AVX2-NEXT: movzbl 464(%rbp), %eax
-; AVX2-NEXT: movzbl %al, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rcx)
-; AVX2-NEXT: movzbl 472(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 464(%rbp), %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $63, %eax
+; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax)
+; AVX2-NEXT: movl 472(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $63, %eax
; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax)
; AVX2-NEXT: vpextrb $15, %xmm0, %eax
; AVX2-NEXT: cmpq $64, %rcx
-; AVX2-NEXT: cmovbl %edx, %eax
+; AVX2-NEXT: cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; AVX2-NEXT: cmpq $63, %rcx
-; AVX2-NEXT: movl $63, %edx
-; AVX2-NEXT: cmovbq %rcx, %rdx
-; AVX2-NEXT: movb %al, (%rsp,%rdx)
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: movl $63, %ecx
+; AVX2-NEXT: cmovbq %rdx, %rcx
+; AVX2-NEXT: movb %al, (%rsp,%rcx)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: leaq -40(%rbp), %rsp
@@ -3323,10 +3277,10 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $288, %rsp # imm = 0x120
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9
; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8
+; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vmovss %xmm0, (%rsp)
@@ -3344,413 +3298,355 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX2-NEXT: vmovss %xmm0, (%rsp,%r8,4)
; AVX2-NEXT: andl $1, %r9d
; AVX2-NEXT: addl %r8d, %r9d
-; AVX2-NEXT: movzbl 16(%rbp), %ecx
; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%r9,4)
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 16(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %r9d, %ecx
-; AVX2-NEXT: movzbl 24(%rbp), %edx
; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 24(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: movzbl 32(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 32(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 40(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 40(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vmovss %xmm1, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 48(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 48(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 56(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 56(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm1, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 64(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 64(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm1, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 72(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 72(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 80(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 80(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 88(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 88(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 96(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 96(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 104(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 104(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vmovss %xmm2, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 112(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 112(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm2, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 120(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 120(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm2, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 128(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 128(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm2, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 136(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 136(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm0
; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 144(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 144(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 152(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 152(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 160(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 160(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 168(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 168(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vmovss %xmm3, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 176(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 176(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm3, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 184(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 184(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm3, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 192(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 192(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm3, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 200(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 200(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm0
; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 208(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 208(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 216(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 216(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 224(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 224(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 232(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 232(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vmovss %xmm4, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 240(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 240(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm4, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 248(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 248(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm4, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 256(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 256(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm4, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 264(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 264(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm0
; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 272(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 272(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 280(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 280(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 288(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 288(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 296(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 296(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vmovss %xmm5, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 304(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 304(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm5, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 312(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 312(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm5, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 320(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 320(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm5, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 328(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 328(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm0
; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 336(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 336(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 344(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 344(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 352(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 352(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 360(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 360(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vmovss %xmm6, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 368(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 368(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm6, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 376(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 376(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm6, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 384(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 384(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm6, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 392(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 392(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm0
; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 400(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 400(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 408(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 408(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 416(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 416(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 424(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 424(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vmovss %xmm7, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 432(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 432(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm7, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 440(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 440(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm7, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 448(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 448(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $3, %xmm7, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 456(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 456(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm0
; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4)
-; AVX2-NEXT: movzbl 464(%rbp), %ecx
-; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: movl 464(%rbp), %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addl %edx, %ecx
-; AVX2-NEXT: # kill: def $edx killed $edx def $rdx
+; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $63, %edx
; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4)
-; AVX2-NEXT: movzbl 472(%rbp), %edx
-; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: movl 472(%rbp), %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addl %ecx, %edx
-; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
; AVX2-NEXT: andl $63, %edx
@@ -4748,6 +4644,17 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) nounwind {
; AVX2-NEXT: vpextrb $3, %xmm1, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
+; AVX2-NEXT: vpextrb $4, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $5, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $6, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $7, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $8, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $9, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $10, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $11, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $12, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $13, %xmm0, -24(%rsp,%rcx)
+; AVX2-NEXT: vpextrb $14, %xmm0, -24(%rsp,%rcx)
; AVX2-NEXT: vpextrb $15, %xmm0, -24(%rsp,%rcx)
; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index fce622a99bb6a..560d5be284f15 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -12,6 +12,9 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzbl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movd %eax, %xmm0
+; X64-NO-BMI2-NEXT: movd %xmm0, %eax
+; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -21,6 +24,9 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzbl (%rdi), %eax
+; X64-BMI2-NEXT: movd %eax, %xmm0
+; X64-BMI2-NEXT: movd %xmm0, %eax
+; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movb %al, (%rdx)
@@ -28,14 +34,17 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
; X86-NO-BMI2: # %bb.0:
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT: movzbl (%eax), %eax
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NEXT: movzbl (%edx), %edx
+; X86-NO-BMI2-NEXT: movd %edx, %xmm0
+; X86-NO-BMI2-NEXT: movd %xmm0, %edx
+; X86-NO-BMI2-NEXT: movzwl %dx, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NEXT: movb %al, (%edx)
+; X86-NO-BMI2-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NEXT: movb %dl, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
@@ -44,6 +53,9 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzbl (%edx), %edx
+; X86-BMI2-NEXT: movd %edx, %xmm0
+; X86-BMI2-NEXT: movd %xmm0, %edx
+; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movb %cl, (%eax)
@@ -65,6 +77,10 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movd %eax, %xmm0
+; X64-NO-BMI2-NEXT: pxor %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NO-BMI2-NEXT: movd %xmm0, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -74,6 +90,10 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-BMI2-NEXT: movd %eax, %xmm0
+; X64-BMI2-NEXT: pxor %xmm1, %xmm1
+; X64-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-BMI2-NEXT: movd %xmm0, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movb %al, (%rdx)
@@ -81,14 +101,18 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X86-NO-BMI2: # %bb.0:
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT: movzwl (%eax), %eax
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT: movd %edx, %xmm0
+; X86-NO-BMI2-NEXT: pxor %xmm1, %xmm1
+; X86-NO-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NO-BMI2-NEXT: movd %xmm0, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NEXT: movb %al, (%edx)
+; X86-NO-BMI2-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NEXT: movb %dl, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -97,6 +121,10 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
+; X86-BMI2-NEXT: movd %edx, %xmm0
+; X86-BMI2-NEXT: pxor %xmm1, %xmm1
+; X86-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-BMI2-NEXT: movd %xmm0, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movb %cl, (%eax)
@@ -119,6 +147,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movd %eax, %xmm0
+; X64-NO-BMI2-NEXT: pxor %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NO-BMI2-NEXT: movd %xmm0, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -128,6 +160,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-BMI2-NEXT: movd %eax, %xmm0
+; X64-BMI2-NEXT: pxor %xmm1, %xmm1
+; X64-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-BMI2-NEXT: movd %xmm0, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movw %ax, (%rdx)
@@ -139,6 +175,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT: movd %edx, %xmm0
+; X86-NO-BMI2-NEXT: pxor %xmm1, %xmm1
+; X86-NO-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NO-BMI2-NEXT: movd %xmm0, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT: shrl %cl, %edx
@@ -151,6 +191,10 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
+; X86-BMI2-NEXT: movd %edx, %xmm0
+; X86-BMI2-NEXT: pxor %xmm1, %xmm1
+; X86-BMI2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-BMI2-NEXT: movd %xmm0, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movw %cx, (%eax)
@@ -171,8 +215,9 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movb %al, (%rdx)
@@ -180,8 +225,9 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movl (%rdi), %eax
+; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movq %xmm0, %rax
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movb %al, (%rdx)
; X64-BMI2-NEXT: retq
@@ -248,8 +294,9 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
@@ -257,8 +304,9 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movl (%rdi), %eax
+; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movq %xmm0, %rax
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movw %ax, (%rdx)
; X64-BMI2-NEXT: retq
@@ -324,8 +372,9 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
@@ -333,8 +382,9 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movl (%rdi), %eax
+; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movq %xmm0, %rax
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movl %eax, (%rdx)
; X64-BMI2-NEXT: retq
@@ -400,38 +450,73 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (%rax,%rax), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: orl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT: movb %sil, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movb %al, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-SHLD: # %bb.0:
-; X64-SHLD-NEXT: movq %rsi, %rcx
-; X64-SHLD-NEXT: movq (%rdi), %rax
-; X64-SHLD-NEXT: shll $3, %ecx
-; X64-SHLD-NEXT: xorl %esi, %esi
-; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-SHLD-NEXT: testb $64, %cl
-; X64-SHLD-NEXT: cmovneq %rsi, %rax
-; X64-SHLD-NEXT: movb %al, (%rdx)
-; X64-SHLD-NEXT: retq
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (%rcx,%rcx), %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
@@ -439,12 +524,11 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
@@ -469,12 +553,11 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-SHLD-NEXT: movl %ecx, %edx
; X86-SHLD-NEXT: shrb $3, %dl
; X86-SHLD-NEXT: andb $12, %dl
@@ -495,12 +578,11 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
@@ -532,38 +614,73 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (%rax,%rax), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: orl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT: movw %si, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movw %ax, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-SHLD: # %bb.0:
-; X64-SHLD-NEXT: movq %rsi, %rcx
-; X64-SHLD-NEXT: movq (%rdi), %rax
-; X64-SHLD-NEXT: shll $3, %ecx
-; X64-SHLD-NEXT: xorl %esi, %esi
-; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-SHLD-NEXT: testb $64, %cl
-; X64-SHLD-NEXT: cmovneq %rsi, %rax
-; X64-SHLD-NEXT: movw %ax, (%rdx)
-; X64-SHLD-NEXT: retq
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (%rcx,%rcx), %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
@@ -571,12 +688,11 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
@@ -601,12 +717,11 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-SHLD-NEXT: movl %ecx, %edx
; X86-SHLD-NEXT: shrb $3, %dl
; X86-SHLD-NEXT: andb $12, %dl
@@ -627,12 +742,11 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
@@ -663,38 +777,73 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (%rax,%rax), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: orl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-SHLD: # %bb.0:
-; X64-SHLD-NEXT: movq %rsi, %rcx
-; X64-SHLD-NEXT: movq (%rdi), %rax
-; X64-SHLD-NEXT: shll $3, %ecx
-; X64-SHLD-NEXT: xorl %esi, %esi
-; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-SHLD-NEXT: testb $64, %cl
-; X64-SHLD-NEXT: cmovneq %rsi, %rax
-; X64-SHLD-NEXT: movl %eax, (%rdx)
-; X64-SHLD-NEXT: retq
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (%rcx,%rcx), %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
@@ -702,12 +851,11 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
@@ -732,12 +880,11 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-SHLD-NEXT: movl %ecx, %edx
; X86-SHLD-NEXT: shrb $3, %dl
; X86-SHLD-NEXT: andb $12, %dl
@@ -758,12 +905,11 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
@@ -794,38 +940,73 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-SHLD: # %bb.0:
-; X64-SHLD-NEXT: movq %rsi, %rcx
-; X64-SHLD-NEXT: movq (%rdi), %rax
-; X64-SHLD-NEXT: shll $3, %ecx
-; X64-SHLD-NEXT: xorl %esi, %esi
-; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-SHLD-NEXT: testb $64, %cl
-; X64-SHLD-NEXT: cmovneq %rsi, %rax
-; X64-SHLD-NEXT: movq %rax, (%rdx)
-; X64-SHLD-NEXT: retq
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
@@ -836,12 +1017,11 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
@@ -881,12 +1061,11 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-SHLD-NEXT: movl %ecx, %edx
; X86-SHLD-NEXT: shrb $3, %dl
; X86-SHLD-NEXT: andb $12, %dl
@@ -916,12 +1095,11 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
@@ -964,13 +1142,13 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movl %ecx, %eax
; X64-NO-BMI2-NEXT: shrb $6, %al
; X64-NO-BMI2-NEXT: movzbl %al, %eax
@@ -982,13 +1160,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
;
; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: xorps %xmm0, %xmm0
; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movups (%rdi), %xmm1
; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movl %esi, %eax
; X64-BMI2-NEXT: shrb $6, %al
; X64-BMI2-NEXT: movzbl %al, %eax
@@ -1003,13 +1181,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
@@ -1033,13 +1211,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movups (%edx), %xmm0
-; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-SHLD-NEXT: shll $3, %ecx
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movups (%edx), %xmm1
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-SHLD-NEXT: movl %ecx, %edx
; X86-SHLD-NEXT: shrb $5, %dl
; X86-SHLD-NEXT: movzbl %dl, %edx
@@ -1059,13 +1237,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
@@ -1096,13 +1274,13 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movl %ecx, %eax
; X64-NO-BMI2-NEXT: shrb $6, %al
; X64-NO-BMI2-NEXT: movzbl %al, %eax
@@ -1120,13 +1298,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
;
; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: xorps %xmm0, %xmm0
; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movups (%rdi), %xmm1
; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movl %esi, %eax
; X64-BMI2-NEXT: shrb $6, %al
; X64-BMI2-NEXT: movzbl %al, %eax
@@ -1148,13 +1326,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
@@ -1178,13 +1356,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movups (%edx), %xmm0
-; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-SHLD-NEXT: shll $3, %ecx
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movups (%edx), %xmm1
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-SHLD-NEXT: movl %ecx, %edx
; X86-SHLD-NEXT: shrb $5, %dl
; X86-SHLD-NEXT: movzbl %dl, %edx
@@ -1204,13 +1382,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
@@ -1240,13 +1418,13 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movl %ecx, %eax
; X64-NO-BMI2-NEXT: shrb $6, %al
; X64-NO-BMI2-NEXT: movzbl %al, %eax
@@ -1264,13 +1442,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
;
; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: xorps %xmm0, %xmm0
; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movups (%rdi), %xmm1
; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movl %esi, %eax
; X64-BMI2-NEXT: shrb $6, %al
; X64-BMI2-NEXT: movzbl %al, %eax
@@ -1292,13 +1470,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
@@ -1322,13 +1500,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movups (%edx), %xmm0
-; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-SHLD-NEXT: shll $3, %ecx
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movups (%edx), %xmm1
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-SHLD-NEXT: movl %ecx, %edx
; X86-SHLD-NEXT: shrb $5, %dl
; X86-SHLD-NEXT: movzbl %dl, %edx
@@ -1348,13 +1526,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
@@ -1384,13 +1562,13 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
@@ -1407,13 +1585,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
;
; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-SHLD: # %bb.0:
-; X64-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-SHLD-NEXT: xorps %xmm0, %xmm0
; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
-; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movups (%rdi), %xmm1
; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-SHLD-NEXT: movl %ecx, %eax
; X64-SHLD-NEXT: shrb $6, %al
; X64-SHLD-NEXT: movzbl %al, %eax
@@ -1426,13 +1604,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
@@ -1455,13 +1633,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
@@ -1500,13 +1678,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movups (%edx), %xmm0
-; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-SHLD-NEXT: shll $3, %ecx
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movups (%edx), %xmm1
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-SHLD-NEXT: movl %ecx, %edx
; X86-SHLD-NEXT: shrb $5, %dl
; X86-SHLD-NEXT: movzbl %dl, %edx
@@ -1535,13 +1713,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
@@ -1583,13 +1761,13 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
@@ -1616,13 +1794,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi
@@ -1644,13 +1822,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
@@ -1673,15 +1851,15 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
@@ -1707,13 +1885,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
@@ -1773,13 +1951,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X86-SHLD-NEXT: subl $92, %esp
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT: movups (%eax), %xmm0
-; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-SHLD-NEXT: shll $3, %ecx
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movups (%eax), %xmm1
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movl %ecx, %eax
; X86-SHLD-NEXT: shrb $5, %al
; X86-SHLD-NEXT: movzbl %al, %ebx
@@ -1816,13 +1994,13 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
@@ -1881,17 +2059,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: pushq %rax
-; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: andl $56, %ecx
; X64-NO-BMI2-NEXT: andl $56, %esi
@@ -1910,17 +2088,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: pushq %rax
-; X64-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: xorps %xmm0, %xmm0
+; X64-BMI2-NEXT: movups (%rdi), %xmm1
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm2
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
; X64-BMI2-NEXT: andl $56, %esi
@@ -1942,17 +2120,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
@@ -1975,17 +2153,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm2
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-SHLD-NEXT: leal (,%edx,8), %ecx
; X86-SHLD-NEXT: andl $60, %edx
; X86-SHLD-NEXT: movl (%esp,%edx), %ebx
@@ -2004,17 +2182,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
@@ -2045,17 +2223,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: pushq %rax
-; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: andl $56, %ecx
; X64-NO-BMI2-NEXT: andl $56, %esi
@@ -2074,17 +2252,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: pushq %rax
-; X64-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: xorps %xmm0, %xmm0
+; X64-BMI2-NEXT: movups (%rdi), %xmm1
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm2
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
; X64-BMI2-NEXT: andl $56, %esi
@@ -2106,17 +2284,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
@@ -2139,17 +2317,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm2
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-SHLD-NEXT: leal (,%edx,8), %ecx
; X86-SHLD-NEXT: andl $60, %edx
; X86-SHLD-NEXT: movl (%esp,%edx), %esi
@@ -2168,17 +2346,17 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
@@ -2208,17 +2386,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: pushq %rax
-; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: andl $56, %ecx
; X64-NO-BMI2-NEXT: andl $56, %esi
@@ -2237,17 +2415,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: pushq %rax
-; X64-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: xorps %xmm0, %xmm0
+; X64-BMI2-NEXT: movups (%rdi), %xmm1
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm2
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
; X64-BMI2-NEXT: andl $56, %esi
@@ -2269,17 +2447,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
@@ -2302,17 +2480,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm2
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-SHLD-NEXT: leal (,%edx,8), %ecx
; X86-SHLD-NEXT: andl $60, %edx
; X86-SHLD-NEXT: movl (%esp,%edx), %esi
@@ -2331,17 +2509,17 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
@@ -2371,17 +2549,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %rax
@@ -2399,17 +2577,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-SHLD: # %bb.0:
; X64-SHLD-NEXT: pushq %rax
-; X64-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-SHLD-NEXT: movups 16(%rdi), %xmm2
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-SHLD-NEXT: andl $56, %esi
; X64-SHLD-NEXT: movq -128(%rsp,%rsi), %rax
@@ -2423,17 +2601,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
@@ -2455,17 +2633,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-NO-BMI2-NO-SHLD-NEXT: subl $140, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %esi
@@ -2506,17 +2684,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movups (%edx), %xmm0
-; X86-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-SHLD-NEXT: movups (%edx), %xmm1
+; X86-SHLD-NEXT: movups 16(%edx), %xmm2
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-SHLD-NEXT: movl %ecx, %esi
; X86-SHLD-NEXT: andl $60, %esi
; X86-SHLD-NEXT: movl 8(%esp,%esi), %edi
@@ -2545,17 +2723,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
@@ -2595,17 +2773,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
@@ -2634,17 +2812,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %edi
@@ -2670,17 +2848,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
@@ -2706,17 +2884,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
@@ -2745,17 +2923,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-NO-BMI2-NO-SHLD-NEXT: subl $156, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx
@@ -2816,17 +2994,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-SHLD-NEXT: subl $156, %esp
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT: movups (%eax), %xmm0
-; X86-SHLD-NEXT: movups 16(%eax), %xmm1
-; X86-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-SHLD-NEXT: movups (%eax), %xmm1
+; X86-SHLD-NEXT: movups 16(%eax), %xmm2
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movl %ecx, %edi
; X86-SHLD-NEXT: andl $60, %edi
; X86-SHLD-NEXT: movl 24(%esp,%edi), %esi
@@ -2864,17 +3042,17 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
@@ -2931,17 +3109,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
@@ -2993,17 +3171,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %edi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
@@ -3046,17 +3224,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
@@ -3097,17 +3275,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx
@@ -3146,17 +3324,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-NO-BMI2-NO-SHLD-NEXT: subl $172, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebx
@@ -3257,17 +3435,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-SHLD-NEXT: subl $156, %esp
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT: movups (%eax), %xmm0
-; X86-SHLD-NEXT: movups 16(%eax), %xmm1
-; X86-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-SHLD-NEXT: movups (%eax), %xmm1
+; X86-SHLD-NEXT: movups 16(%eax), %xmm2
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movl %ecx, %edi
; X86-SHLD-NEXT: andl $60, %edi
; X86-SHLD-NEXT: movl 24(%esp,%edi), %edx
@@ -3324,17 +3502,17 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 8d36eef952a2b..c4c87086dc359 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -168,8 +168,8 @@ define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movb %al, (%rdx)
@@ -188,17 +188,15 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -215,13 +213,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
@@ -236,14 +232,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -260,23 +253,19 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -292,8 +281,8 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
@@ -312,17 +301,15 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -339,18 +326,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
@@ -360,14 +345,11 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -386,18 +368,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
@@ -413,8 +393,8 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
@@ -433,17 +413,15 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -460,18 +438,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
@@ -481,14 +457,11 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -507,18 +480,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
@@ -536,8 +507,8 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -557,8 +528,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -571,8 +542,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -591,8 +562,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -610,8 +581,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -639,8 +610,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -664,8 +635,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -698,8 +669,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -719,8 +690,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -733,8 +704,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -753,8 +724,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -772,8 +743,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -801,8 +772,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -826,8 +797,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -859,8 +830,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -880,8 +851,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -894,8 +865,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -914,8 +885,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -933,8 +904,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -962,8 +933,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -987,8 +958,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -1020,8 +991,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -1041,8 +1012,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1055,8 +1026,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -1075,8 +1046,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1097,8 +1068,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -1141,8 +1112,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -1175,8 +1146,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
@@ -1222,9 +1193,9 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1241,9 +1212,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: movups (%rdi), %xmm0
; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: xorps %xmm2, %xmm2
; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1263,9 +1234,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1294,9 +1265,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1321,9 +1292,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1357,9 +1328,9 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1382,9 +1353,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: movups (%rdi), %xmm0
; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: xorps %xmm2, %xmm2
; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1411,9 +1382,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1442,9 +1413,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1469,9 +1440,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1504,9 +1475,9 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1529,9 +1500,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: movups (%rdi), %xmm0
; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: xorps %xmm2, %xmm2
; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1558,9 +1529,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1589,9 +1560,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1616,9 +1587,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1651,9 +1622,9 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1675,9 +1646,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-SHLD-NEXT: movups (%rdi), %xmm0
; X64-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-SHLD-NEXT: xorps %xmm2, %xmm2
; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1695,9 +1666,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1725,9 +1696,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1771,9 +1742,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: movups (%edx), %xmm0
; X86-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1807,9 +1778,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -1854,9 +1825,9 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1888,9 +1859,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1917,9 +1888,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1947,9 +1918,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
@@ -1982,9 +1953,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2049,9 +2020,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; X86-SHLD-NEXT: subl $92, %esp
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: movups (%eax), %xmm0
; X86-SHLD-NEXT: movups 16(%eax), %xmm1
-; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2093,9 +2064,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)