[llvm] 0586023 - [X86] X86FixupInstTuning - fold BLENDPS -> MOVSD (#144029)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 13 03:06:01 PDT 2025
Author: Simon Pilgrim
Date: 2025-06-13T11:05:57+01:00
New Revision: 058602372e2bb7460469c5c53cc36f0a4b131f54
URL: https://github.com/llvm/llvm-project/commit/058602372e2bb7460469c5c53cc36f0a4b131f54
DIFF: https://github.com/llvm/llvm-project/commit/058602372e2bb7460469c5c53cc36f0a4b131f54.diff
LOG: [X86] X86FixupInstTuning - fold BLENDPS -> MOVSD (#144029)
Reduces code size: make use of free PS<->PD domain transfers (as we do in many other places) and replace a suitable BLENDPS mask with MOVSD when OptSize is set or the scheduler prefers it.
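As a sketch of what the fold matches (see the X86FixupInstTuning.cpp change below): for BLENDPS, a blend immediate of 0x1 (only the low 32-bit lane taken from the second source) corresponds to MOVSS, and 0x3 (the low two 32-bit lanes, i.e. the low 64 bits) corresponds to MOVSD; for BLENDPD, an immediate of 0x1 corresponds to MOVSD. A typical before/after taken from the updated tests, showing the encoding saving:

    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]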
Added:
Modified:
llvm/lib/Target/X86/X86FixupInstTuning.cpp
llvm/test/CodeGen/X86/avx-insertelt.ll
llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
llvm/test/CodeGen/X86/combine-and.ll
llvm/test/CodeGen/X86/combine-or-shuffle.ll
llvm/test/CodeGen/X86/commute-blend-sse41.ll
llvm/test/CodeGen/X86/horizontal-sum.ll
llvm/test/CodeGen/X86/insertelement-zero.ll
llvm/test/CodeGen/X86/masked_load.ll
llvm/test/CodeGen/X86/sse-insertelt.ll
llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
llvm/test/CodeGen/X86/sse2.ll
llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
llvm/test/CodeGen/X86/vec_floor.ll
llvm/test/CodeGen/X86/vector-blend.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
llvm/test/CodeGen/X86/vector-mul.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
llvm/test/CodeGen/X86/vselect-2.ll
llvm/test/CodeGen/X86/vselect.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index be0a8c23ea5c4..ce1e4966553f5 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -222,8 +222,9 @@ bool X86FixupInstTuningPass::processInstruction(
return ProcessUNPCKToIntDomain(NewOpc);
};
- auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
- if (MI.getOperand(NumOperands - 1).getImm() != 1)
+ auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
+ unsigned MovImm) -> bool {
+ if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
return false;
bool Force = MF.getFunction().hasOptSize();
if (!Force && !NewOpcPreferable(MovOpc))
@@ -235,14 +236,16 @@ bool X86FixupInstTuningPass::processInstruction(
switch (Opc) {
case X86::BLENDPDrri:
- return ProcessBLENDToMOV(X86::MOVSDrr);
+ return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
case X86::VBLENDPDrri:
- return ProcessBLENDToMOV(X86::VMOVSDrr);
+ return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1);
case X86::BLENDPSrri:
- return ProcessBLENDToMOV(X86::MOVSSrr);
+ return ProcessBLENDToMOV(X86::MOVSSrr, 0xF, 0x1) ||
+ ProcessBLENDToMOV(X86::MOVSDrr, 0xF, 0x3);
case X86::VBLENDPSrri:
- return ProcessBLENDToMOV(X86::VMOVSSrr);
+ return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) ||
+ ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3);
case X86::VPERMILPDri:
return ProcessVPERMILPDri(X86::VSHUFPDrri);
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
index 02e6c9649c9a1..f8feceb0404b5 100644
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -111,7 +111,7 @@ define <4 x double> @insert_f64_firstelt_of_high_subvector(<4 x double> %x, doub
; AVX-LABEL: insert_f64_firstelt_of_high_subvector:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 966662f5f9f8f..f0203b3b889e4 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -300,8 +300,8 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blendpd:
; CHECK: # %bb.0:
-; CHECK-NEXT: vblendps $3, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03]
-; CHECK-NEXT: # xmm0 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
+; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
diff --git a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
index 086df87d1d5ff..441c79b3fc31f 100644
--- a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
+++ b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
@@ -19,12 +19,12 @@ define <2 x double> @insert_f64(double %a0, <2 x double> %a1) {
;
; AVX-LABEL: insert_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: insert_f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512-NEXT: retq
%1 = insertelement <2 x double> %a1, double %a0, i32 0
ret <2 x double> %1
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index 9ca4ebfec2774..a476b21979cef 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -127,7 +127,7 @@ define <4 x i32> @test7(<4 x i32> %A) {
; SSE-LABEL: test7:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: test7:
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 2f2a05fa6939b..14e3767f65564 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -31,15 +31,10 @@ define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: # %bb.0:
@@ -53,15 +48,10 @@ define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test3:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test3:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test3:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test3:
; AVX: # %bb.0:
@@ -201,15 +191,10 @@ define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test9:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test9:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test9:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test9:
; AVX: # %bb.0:
@@ -223,15 +208,10 @@ define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test10:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test10:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test10:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test10:
; AVX: # %bb.0:
@@ -563,20 +543,25 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
; bitcast to use the mask-or blend combine.
define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
-; SSE2-LABEL: test22:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
+; SSE-LABEL: test22:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
-; SSE4-LABEL: test22:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; AVX1-LABEL: test22:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: retq
;
-; AVX-LABEL: test22:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT: retq
+; AVX2-LABEL: test22:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test22:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT: retq
%bc1 = bitcast <2 x double> %a0 to <2 x i64>
%bc2 = bitcast <2 x double> %a1 to <2 x i64>
%and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
@@ -614,20 +599,25 @@ define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
-; SSE2-LABEL: test24:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
+; SSE-LABEL: test24:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
-; SSE4-LABEL: test24:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; AVX1-LABEL: test24:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: retq
;
-; AVX-LABEL: test24:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT: retq
+; AVX2-LABEL: test24:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test24:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT: retq
%bc1 = bitcast <4 x float> %a0 to <2 x i64>
%bc2 = bitcast <4 x float> %a1 to <2 x i64>
%and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
@@ -707,15 +697,10 @@ define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
; Verify that we can fold regardless of which operand is the zeroinitializer
define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2b:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2b:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2b:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2b:
; AVX: # %bb.0:
@@ -728,15 +713,10 @@ define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
}
define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2c:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2c:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2c:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2c:
; AVX: # %bb.0:
@@ -750,15 +730,10 @@ define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2d:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2d:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2d:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2d:
; AVX: # %bb.0:
@@ -773,15 +748,10 @@ define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
; Make sure we can have an undef where an index pointing to the zero vector should be
define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2e:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2e:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2e:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2e:
; AVX: # %bb.0:
@@ -794,15 +764,10 @@ define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
}
define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2f:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: test2f:
-; SSE4: # %bb.0:
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT: retq
+; SSE-LABEL: test2f:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2f:
; AVX: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/commute-blend-sse41.ll b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
index 07d6a8ba22bb1..4740bf59a69e7 100644
--- a/llvm/test/CodeGen/X86/commute-blend-sse41.ll
+++ b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
@@ -57,7 +57,7 @@ define void @baz(ptr %arg, ptr %arg1) optsize {
; CHECK-NEXT: movaps (%rdi), %xmm0
; CHECK-NEXT: movaps {{.*#+}} xmm1 = [3,3]
; CHECK-NEXT: andps %xmm0, %xmm1
-; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; CHECK-NEXT: movups %xmm1, (%rsi)
; CHECK-NEXT: retq
bb:
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 443275e11459d..0afc4f784bc5e 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -577,7 +577,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
-; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm4, %xmm4
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
@@ -596,7 +596,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
-; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX-FAST-NEXT: vmovsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll
index 6036eddb0ca84..b66ad07c466e1 100644
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -30,13 +30,13 @@ define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
; SSE41-LABEL: insert_v2f64_z1:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v2f64_z1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%1 = insertelement <2 x double> %a, double 0.0, i32 0
ret <2 x double> %1
@@ -68,7 +68,7 @@ define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) {
; SSE41: # %bb.0:
; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v4f64_0zz3:
@@ -103,7 +103,7 @@ define <2 x i64> @insert_v2i64_z1(<2 x i64> %a) {
; SSE41-LABEL: insert_v2i64_z1:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v2i64_z1:
@@ -137,7 +137,7 @@ define <4 x i64> @insert_v4i64_01z3(<4 x i64> %a) {
; SSE41-LABEL: insert_v4i64_01z3:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v4i64_01z3:
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 37ab4276fbcca..8c4bab99a5b7b 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6512,7 +6512,7 @@ define <8 x float> @mload_constmask_v8f32(ptr %addr, <8 x float> %dst) {
; SSE42-LABEL: mload_constmask_v8f32:
; SSE42: ## %bb.0:
; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; SSE42-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll
index f174eaaca38c2..72e002ed6b7db 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -21,19 +21,14 @@ define <4 x float> @insert_f32_firstelt(<4 x float> %x, float %s) {
}
define <2 x double> @insert_f64_firstelt(<2 x double> %x, double %s) {
-; SSE2-LABEL: insert_f64_firstelt:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: insert_f64_firstelt:
-; SSE41: # %bb.0:
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: insert_f64_firstelt:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: insert_f64_firstelt:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%i0 = insertelement <2 x double> %x, double %s, i32 0
ret <2 x double> %i0
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 12bfb8d4fc9cf..325f735b09cd9 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE2
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE41
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX512
@@ -1333,29 +1333,17 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
}
define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
-; X86-SSE2-LABEL: add_sd_mask:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: jne .LBB71_1
-; X86-SSE2-NEXT: # %bb.2:
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X86-SSE2-NEXT: retl
-; X86-SSE2-NEXT: .LBB71_1:
-; X86-SSE2-NEXT: addsd %xmm0, %xmm1
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE2-NEXT: retl
-;
-; X86-SSE41-LABEL: add_sd_mask:
-; X86-SSE41: # %bb.0:
-; X86-SSE41-NEXT: testb $1, {{[0-9]+}}(%esp)
-; X86-SSE41-NEXT: jne .LBB71_1
-; X86-SSE41-NEXT: # %bb.2:
-; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; X86-SSE41-NEXT: retl
-; X86-SSE41-NEXT: .LBB71_1:
-; X86-SSE41-NEXT: addsd %xmm0, %xmm1
-; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE41-NEXT: retl
+; X86-SSE-LABEL: add_sd_mask:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: jne .LBB71_1
+; X86-SSE-NEXT: # %bb.2:
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X86-SSE-NEXT: retl
+; X86-SSE-NEXT: .LBB71_1:
+; X86-SSE-NEXT: addsd %xmm0, %xmm1
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: add_sd_mask:
; X86-AVX1: # %bb.0:
@@ -1375,29 +1363,17 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
; X86-AVX512-NEXT: vmovapd %xmm2, %xmm0
; X86-AVX512-NEXT: retl
;
-; X64-SSE2-LABEL: add_sd_mask:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: testb $1, %dil
-; X64-SSE2-NEXT: jne .LBB71_1
-; X64-SSE2-NEXT: # %bb.2:
-; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X64-SSE2-NEXT: retq
-; X64-SSE2-NEXT: .LBB71_1:
-; X64-SSE2-NEXT: addsd %xmm0, %xmm1
-; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X64-SSE2-NEXT: retq
-;
-; X64-SSE41-LABEL: add_sd_mask:
-; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: testb $1, %dil
-; X64-SSE41-NEXT: jne .LBB71_1
-; X64-SSE41-NEXT: # %bb.2:
-; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; X64-SSE41-NEXT: retq
-; X64-SSE41-NEXT: .LBB71_1:
-; X64-SSE41-NEXT: addsd %xmm0, %xmm1
-; X64-SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X64-SSE41-NEXT: retq
+; X64-SSE-LABEL: add_sd_mask:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: testb $1, %dil
+; X64-SSE-NEXT: jne .LBB71_1
+; X64-SSE-NEXT: # %bb.2:
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X64-SSE-NEXT: retq
+; X64-SSE-NEXT: .LBB71_1:
+; X64-SSE-NEXT: addsd %xmm0, %xmm1
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: add_sd_mask:
; X64-AVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 18a6be8aaf0b6..3f48b22e2b9ff 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -3010,8 +3010,8 @@ define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwin
;
; AVX-LABEL: test_mm_move_sd:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; AVX-NEXT: # xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ext0 = extractelement <2 x double> %a1, i32 0
%res0 = insertelement <2 x double> undef, double %ext0, i32 0
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 6dd75c8c09ce5..413b4e79257a0 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -724,8 +724,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
; X86-AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x08]
; X86-AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0xc9]
-; X86-AVX1-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX1-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1]
; X86-AVX1-NEXT: retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: test_x86_sse2_cvtss2sd_load:
@@ -734,8 +734,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
; X86-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
; X86-AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
-; X86-AVX512-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX512-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1]
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: test_x86_sse2_cvtss2sd_load:
@@ -752,8 +752,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
; X64-AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x0f]
; X64-AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0xc9]
-; X64-AVX1-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X64-AVX1-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X64-AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1]
; X64-AVX1-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: test_x86_sse2_cvtss2sd_load:
@@ -761,8 +761,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
; X64-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
; X64-AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
-; X64-AVX512-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X64-AVX512-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X64-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1]
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%a1 = load <4 x float>, ptr %p1
%res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll
index e1d91b407fc28..6e77d3e4fd134 100644
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@@ -417,7 +417,7 @@ define void @test12() nounwind {
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps 0, %xmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
index 47630501864a5..c6f0ec493a36c 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -33,7 +33,7 @@ define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
;
; AVX-LABEL: test_mm_blend_pd:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
ret <2 x double> %res
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
index bdf8033a00b0a..137606b7cfeed 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -18,8 +18,8 @@ define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1)
;
; AVX-LABEL: test_x86_sse41_blendpd:
; AVX: ## %bb.0:
-; AVX-NEXT: vblendps $3, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03]
-; AVX-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX-NEXT: ## xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 6) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
index a6e288608c87b..35688e59fc9f4 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
@@ -93,7 +93,7 @@ define <2 x double> @f12(<2 x double> %a0, <8 x half> %a1) #0 {
; CHECK-LABEL: f12:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsh2sd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; CHECK-NEXT: ret{{[l|q]}}
%ext = extractelement <8 x half> %a1, i32 0
%cvt = call double @llvm.experimental.constrained.fpext.f64.f16(half %ext,
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index 1007969b6c6d1..7f4ed3394d10d 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -1653,7 +1653,7 @@ define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16
; AVX-NEXT: jne LBB59_1
; AVX-NEXT: ## %bb.2:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
; AVX-NEXT: LBB59_1:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
@@ -2643,7 +2643,7 @@ define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %
; AVX-NEXT: jne LBB85_1
; AVX-NEXT: ## %bb.2:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
; AVX-NEXT: LBB85_1:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index a38028e87532f..2d2fc6b6ee0d7 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -172,7 +172,7 @@ define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
;
; AVX-LABEL: vsel_double:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
entry:
%vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
@@ -732,23 +732,11 @@ entry:
}
define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
-; SSE2-LABEL: blend_shufflevector_4xi64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movaps %xmm3, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: blend_shufflevector_4xi64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movaps %xmm3, %xmm1
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: blend_shufflevector_4xi64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movaps %xmm3, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: blend_shufflevector_4xi64:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: blend_shufflevector_4xi64:
; AVX: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 0bf1260738439..822d31eb45139 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -250,10 +250,10 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm5 = xmm5[0],xmm6[1]
; AVX-NEXT: vunpckhps {{.*#+}} xmm6 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm2[2],xmm3[2]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
; AVX-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
@@ -584,14 +584,14 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,0],ymm9[2,3],ymm10[6,4],ymm9[6,7]
; AVX-NEXT: vinsertps {{.*#+}} xmm10 = xmm5[1],xmm6[1],zero,zero
; AVX-NEXT: vunpcklps {{.*#+}} xmm11 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm10 = xmm10[0],xmm11[1]
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
; AVX-NEXT: vunpckhps {{.*#+}} ymm11 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7]
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
; AVX-NEXT: vunpckhps {{.*#+}} xmm11 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm7[2],xmm8[2]
-; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm11 = xmm11[0],xmm12[1]
; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[3,0],ymm0[7,4],ymm4[7,4]
@@ -1080,7 +1080,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm2[0],xmm10[1],xmm2[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps %ymm12, %ymm0
@@ -1094,7 +1094,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -1105,7 +1105,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm10[2],xmm3[2]
; AVX-NEXT: vmovaps %xmm10, %xmm14
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
@@ -1115,7 +1115,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm9[2],xmm1[2]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
@@ -2120,7 +2120,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload
; AVX-NEXT: # xmm14 = xmm9[0],mem[0],xmm9[1],mem[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[4],ymm13[4],ymm8[5],ymm13[5]
@@ -2131,7 +2131,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
; AVX-NEXT: # xmm14 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -2147,7 +2147,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
; AVX-NEXT: # xmm14 = xmm5[0],mem[0],xmm5[1],mem[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -2164,7 +2164,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -2176,7 +2176,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload
; AVX-NEXT: # xmm14 = zero,zero,xmm9[2],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
@@ -2187,7 +2187,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
; AVX-NEXT: # xmm14 = zero,zero,xmm5[2],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
@@ -2203,7 +2203,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm2[2],xmm4[2]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
@@ -2215,7 +2215,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm7[2],xmm13[2]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -4239,7 +4239,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm13[0],xmm3[1],xmm13[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -4253,7 +4253,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4269,7 +4269,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4285,7 +4285,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4301,7 +4301,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4317,7 +4317,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4333,7 +4333,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4349,7 +4349,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3]
@@ -4358,7 +4358,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm7[2]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
@@ -4368,7 +4368,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm11[2]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -4384,7 +4384,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
@@ -4400,7 +4400,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4416,7 +4416,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4432,7 +4432,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4448,7 +4448,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4464,7 +4464,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index c08442f9d9d01..4f80140bc6c1b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -359,7 +359,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
; AVX-NEXT: vmovaps (%rdi), %xmm3
; AVX-NEXT: vmovaps 32(%rdi), %xmm4
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm4[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm5 = xmm3[0],xmm4[1]
; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3]
; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,2,3,3]
; AVX-NEXT: vmovaps 64(%rdi), %xmm6
@@ -369,7 +369,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2],xmm7[3]
; AVX-NEXT: vshufpd {{.*#+}} xmm7 = xmm7[1,0]
; AVX-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[1]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],mem[1,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[2]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
@@ -787,7 +787,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[3,0],ymm9[6,4],ymm7[7,4]
; AVX-NEXT: vmovaps (%rdi), %xmm9
; AVX-NEXT: vmovaps 32(%rdi), %xmm10
-; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm10[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm11 = xmm9[0],xmm10[1]
; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3]
; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7]
@@ -806,7 +806,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,2],ymm12[6,4],ymm1[6,6]
-; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,0],mem[1,3]
; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3,4,5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm11[1,0],ymm0[6,4],ymm11[5,4]
@@ -1552,7 +1552,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4]
; AVX-NEXT: vmovaps (%rdi), %xmm15
; AVX-NEXT: vmovaps 32(%rdi), %xmm10
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1],xmm10[2,3]
+; AVX-NEXT: vmovsd %xmm15, %xmm10, %xmm4
; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
@@ -1565,7 +1565,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[3,0],ymm1[6,4],ymm4[7,4]
; AVX-NEXT: vmovaps 160(%rdi), %xmm9
; AVX-NEXT: vmovaps 192(%rdi), %xmm8
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm8[2,3]
+; AVX-NEXT: vmovsd %xmm9, %xmm8, %xmm4
; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
@@ -1597,7 +1597,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm10 = xmm10[0],xmm15[1]
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm5[2,2],ymm13[6,4],ymm5[6,6]
; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],mem[1,3]
@@ -1605,7 +1605,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm8 = xmm8[0],xmm9[1]
; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,0],ymm2[3,0],ymm7[4,4],ymm2[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm2[2,2],ymm9[6,4],ymm2[6,6]
; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3]
@@ -3086,7 +3086,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 192(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7]
@@ -3102,7 +3102,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 512(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -3118,7 +3118,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 32(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -3134,7 +3134,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 352(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6148,7 +6148,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 192(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6165,7 +6165,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 512(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6182,7 +6182,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 832(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6198,7 +6198,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 1152(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6214,7 +6214,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 32(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6230,7 +6230,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 352(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6247,7 +6247,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 672(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6265,7 +6265,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps 992(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 98b5bab98c4f9..13b21a747878b 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -1579,7 +1579,7 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
; SSE4-LABEL: mul_v2i64_0_1:
; SSE4: # %bb.0:
; SSE4-NEXT: xorps %xmm1, %xmm1
-; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE4-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_0_1:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 2d3dc4c593c11..baaae507ae15c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -242,35 +242,20 @@ define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
;
; AVX-LABEL: shuffle_v2f64_03:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: shuffle_v2f64_21:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: shuffle_v2f64_21:
-; SSE3: # %bb.0:
-; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v2f64_21:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v2f64_21:
-; SSE41: # %bb.0:
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v2f64_21:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_21:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
ret <2 x double> %shuffle
@@ -523,25 +508,10 @@ define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: shuffle_v2i64_21:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: shuffle_v2i64_21:
-; SSE3: # %bb.0:
-; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v2i64_21:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v2i64_21:
-; SSE41: # %bb.0:
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_21:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_21:
; AVX: # %bb.0:
@@ -572,7 +542,7 @@ define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
; SSE41-LABEL: shuffle_v2i64_21_copy:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_21_copy:
@@ -740,7 +710,7 @@ define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
; SSE41-LABEL: shuffle_v2i64_z1:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_z1:
@@ -821,13 +791,13 @@ define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
; SSE41-LABEL: shuffle_v2f64_z1:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_z1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
ret <2 x double> %shuffle
@@ -1102,7 +1072,7 @@ define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
;
; AVX-LABEL: insert_reg_lo_v2f64:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%v = insertelement <2 x double> poison, double %a, i32 0
%shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
@@ -1334,7 +1304,7 @@ define <2 x double> @shuffle_mem_v2f64_21(<2 x double> %a, ptr %pb) {
; SSE41-LABEL: shuffle_mem_v2f64_21:
; SSE41: # %bb.0:
; SSE41-NEXT: movups (%rdi), %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_mem_v2f64_21:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index e1eb1a6704e39..9ec24c447c2cc 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2222,7 +2222,7 @@ define <4 x i32> @insert_mem_lo_v4i32(ptr %ptr, <4 x i32> %b) {
; SSE41-LABEL: insert_mem_lo_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_mem_lo_v4i32:
@@ -2295,7 +2295,7 @@ define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
;
; AVX-LABEL: insert_reg_lo_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%a.cast = bitcast double %a to <2 x float>
%v = shufflevector <2 x float> %a.cast, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -2489,7 +2489,7 @@ define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, ptr %pb) {
; SSE41-LABEL: shuffle_mem_v4f32_4523:
; SSE41: # %bb.0:
; SSE41-NEXT: movups (%rdi), %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_mem_v4f32_4523:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 950683cbfaeea..bce50db4d952e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -468,7 +468,7 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 002a3b77dc353..bd2710139d584 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -53,7 +53,7 @@ define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1)
;
; AVX-LABEL: combine_pshufb_as_movsd:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 3, i32 0>
%2 = bitcast <2 x double> %1 to <16 x i8>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
index 2812bf3489101..925f8d5104510 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
@@ -173,7 +173,7 @@ define void @concat_a_to_shuf_of_ab(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
; SSE42: # %bb.0:
; SSE42-NEXT: movaps (%rdi), %xmm0
; SSE42-NEXT: movaps (%rsi), %xmm1
-; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; SSE42-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE42-NEXT: movaps %xmm0, 16(%rdx)
; SSE42-NEXT: movaps %xmm1, (%rdx)
; SSE42-NEXT: retq
@@ -288,7 +288,7 @@ define void @concat_shuf_of_ab_to_a(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
; SSE42: # %bb.0:
; SSE42-NEXT: movaps (%rdi), %xmm0
; SSE42-NEXT: movaps (%rsi), %xmm1
-; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; SSE42-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE42-NEXT: movaps %xmm1, 16(%rdx)
; SSE42-NEXT: movaps %xmm0, (%rdx)
; SSE42-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vselect-2.ll b/llvm/test/CodeGen/X86/vselect-2.ll
index c02cbcf55408d..429ae88fe6d6f 100644
--- a/llvm/test/CodeGen/X86/vselect-2.ll
+++ b/llvm/test/CodeGen/X86/vselect-2.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test1:
@@ -24,15 +24,10 @@ define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
}
define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
-; SSE2-LABEL: test2:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test2:
-; SSE41: # %bb.0:
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: test2:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: # %bb.0:
@@ -55,26 +50,21 @@ define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
;
; AVX-LABEL: test3:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
ret <4 x float> %select
}
define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
-; SSE2-LABEL: test4:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test4:
-; SSE41: # %bb.0:
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: test4:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test4:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
ret <4 x float> %select
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 18a060ad910b7..f70145d6b21c2 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -69,26 +69,21 @@ define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
;
; AVX-LABEL: test2:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
}
define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: test3:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test3:
-; SSE41: # %bb.0:
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: test3:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test3:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
@@ -152,15 +147,10 @@ define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-LABEL: test8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: test8:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test8:
; AVX: # %bb.0:
@@ -329,34 +319,24 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
}
define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: test20:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test20:
-; SSE41: # %bb.0:
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: test20:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test20:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
ret <2 x double> %1
}
define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test21:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test21:
-; SSE41: # %bb.0:
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: test21:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: test21:
; AVX: # %bb.0:
@@ -419,7 +399,7 @@ define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
;
; AVX-LABEL: test24:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
ret <2 x double> %1