[llvm] [X86] Use X86FixupInstTuning to select between (V)MOVSS/D and (V)BLENDPS/D (PR #143312)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 10 07:54:01 PDT 2025
https://github.com/houngkoungting updated https://github.com/llvm/llvm-project/pull/143312
>From 4547715ec05943e2a101583bc0a4a2bc2ac55bc7 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Sun, 8 Jun 2025 22:34:17 +0800
Subject: [PATCH 1/5] [X86] Use X86FixupInstTuning to select between
(V)MOVSS/D and (V)BLENDPS/D
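
This lets X86FixupInstTuning rewrite a (V)BLENDPS/(V)BLENDPD whose immediate
is 1 (i.e. only element 0 is taken from the second source) into the equivalent
(V)MOVSS/(V)MOVSD register-register move, either when the tuning heuristic
(NewOpcPreferable) prefers the MOV form or unconditionally when optimizing for
size, since the MOV form has a shorter encoding. As a rough before/after,
taken from the updated fast-isel tests:

  vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
  vmovss   %xmm1, %xmm0, %xmm0     # encoding: [0xc5,0xfa,0x10,0xc1]
  # both compute xmm0 = xmm1[0],xmm0[1,2,3]
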
---
llvm/lib/Target/X86/X86FixupInstTuning.cpp | 46 +++++++++++
.../test/CodeGen/X86/2012-01-12-extract-sv.ll | 4 +-
llvm/test/CodeGen/X86/avx-insertelt.ll | 8 +-
.../X86/avx512-intrinsics-fast-isel.ll | 4 +-
.../CodeGen/X86/avx512-intrinsics-upgrade.ll | 2 +-
llvm/test/CodeGen/X86/avx512-intrinsics.ll | 2 +-
.../test/CodeGen/X86/avx512copy-intrinsics.ll | 2 +-
llvm/test/CodeGen/X86/build-vector-512.ll | 6 +-
llvm/test/CodeGen/X86/buildvec-extract.ll | 6 +-
.../CodeGen/X86/canonicalize-vars-f16-type.ll | 8 +-
.../CodeGen/X86/coalesce_commute_movsd.ll | 4 +-
llvm/test/CodeGen/X86/combine-and.ll | 4 +-
llvm/test/CodeGen/X86/combine-or-shuffle.ll | 80 ++++++++++++++-----
.../CodeGen/X86/fminimumnum-fmaximumnum.ll | 2 +-
llvm/test/CodeGen/X86/fmsubadd-combine.ll | 4 +-
.../test/CodeGen/X86/fp-strict-scalar-fp16.ll | 14 ++--
.../X86/fp-strict-scalar-inttofp-fp16.ll | 24 +++---
.../X86/fp-strict-scalar-round-fp16.ll | 14 ++--
llvm/test/CodeGen/X86/half-constrained.ll | 6 +-
llvm/test/CodeGen/X86/half-darwin.ll | 2 +-
llvm/test/CodeGen/X86/insertelement-zero.ll | 4 +-
llvm/test/CodeGen/X86/masked_expandload.ll | 14 ++--
llvm/test/CodeGen/X86/masked_gather.ll | 12 +--
.../test/CodeGen/X86/masked_gather_scatter.ll | 2 +-
llvm/test/CodeGen/X86/masked_load.ll | 2 +-
llvm/test/CodeGen/X86/oddsubvector.ll | 4 +-
llvm/test/CodeGen/X86/pr40730.ll | 4 +-
llvm/test/CodeGen/X86/scalarize-fp.ll | 2 +-
.../CodeGen/X86/sse-insertelt-from-mem.ll | 2 +-
llvm/test/CodeGen/X86/sse-insertelt.ll | 2 +-
.../CodeGen/X86/sse-intrinsics-fast-isel.ll | 16 ++--
llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll | 16 ++--
.../CodeGen/X86/sse2-intrinsics-fast-isel.ll | 8 +-
llvm/test/CodeGen/X86/sse2.ll | 2 +-
llvm/test/CodeGen/X86/sse41.ll | 12 +--
.../test/CodeGen/X86/stack-folding-fp-avx1.ll | 4 +-
llvm/test/CodeGen/X86/vec-strict-128-fp16.ll | 2 +-
.../X86/vec-strict-fptoint-128-fp16.ll | 20 ++---
llvm/test/CodeGen/X86/vec_extract-avx.ll | 8 +-
llvm/test/CodeGen/X86/vec_floor.ll | 68 ++++++++--------
llvm/test/CodeGen/X86/vec_ss_load_fold.ll | 8 +-
llvm/test/CodeGen/X86/vector-blend.ll | 2 +-
.../CodeGen/X86/vector-half-conversions.ll | 4 +-
.../vector-interleaved-store-i32-stride-5.ll | 4 +-
.../vector-interleaved-store-i32-stride-7.ll | 2 +-
.../vector-interleaved-store-i64-stride-5.ll | 10 +--
.../test/CodeGen/X86/vector-shuffle-128-v2.ll | 2 +-
.../test/CodeGen/X86/vector-shuffle-128-v4.ll | 16 ++--
.../test/CodeGen/X86/vector-shuffle-256-v4.ll | 32 +++-----
.../test/CodeGen/X86/vector-shuffle-256-v8.ll | 14 ++--
.../X86/vector-shuffle-combining-avx2.ll | 2 +-
.../X86/vector-shuffle-combining-ssse3.ll | 4 +-
.../X86/vector-shuffle-combining-xop.ll | 2 +-
llvm/test/CodeGen/X86/vector-zmov.ll | 2 +-
llvm/test/CodeGen/X86/vselect.ll | 8 +-
55 files changed, 316 insertions(+), 242 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 6bb7600dedcac..7f4d73e89f472 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -221,8 +221,54 @@ bool X86FixupInstTuningPass::processInstruction(
auto ProcessUNPCKPS = [&](unsigned NewOpc) -> bool {
return ProcessUNPCKToIntDomain(NewOpc);
};
+
+  // Convert a (V)BLENDPS/(V)BLENDPD that only takes element 0 from its
+  // second source (immediate == 1) into the equivalent (V)MOVSS/(V)MOVSD,
+  // either when the new opcode is preferable or when optimizing for size.
+  auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
+    if (!MI.getOperand(NumOperands - 1).isImm() ||
+        MI.getOperand(NumOperands - 1).getImm() != 1)
+      return false;
+
+    bool Force = MF.getFunction().hasOptSize();
+    if (!Force && !NewOpcPreferable(MovOpc))
+      return false;
+    MI.setDesc(TII->get(MovOpc));
+    MI.removeOperand(NumOperands - 1);
+    return true;
+  };
+
   switch (Opc) {
+  case X86::VBLENDPSrri:
+  case X86::VBLENDPSYrri:
+  case X86::VBLENDMPSZ128rrkz:
+  case X86::VBLENDMPSZ256rrkz:
+  case X86::VBLENDMPSZrrkz:
+    return ProcessBLENDToMOV(X86::VMOVSSrr);
+
+  case X86::VBLENDPDrri:
+  case X86::VBLENDPDYrri:
+  case X86::VBLENDMPDZ128rrkz:
+  case X86::VBLENDMPDZ256rrkz:
+  case X86::VBLENDMPDZrrkz:
+    return ProcessBLENDToMOV(X86::VMOVSDrr);
+
case X86::VPERMILPDri:
return ProcessVPERMILPDri(X86::VSHUFPDrri);
case X86::VPERMILPDYri:
diff --git a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 254a53fcac4de..65273870c3dfb 100644
--- a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -11,7 +11,7 @@ define void @endless_loop() {
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX1-NEXT: vmovaps %ymm0, (%eax)
; AVX1-NEXT: vmovaps %ymm1, (%eax)
; AVX1-NEXT: vzeroupper
@@ -21,7 +21,7 @@ define void @endless_loop() {
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vbroadcastss (%eax), %xmm0
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
index 18ca01290c914..81f3058f19579 100644
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -8,7 +8,7 @@ define <8 x float> @insert_f32_firstelt_of_low_subvector(<8 x float> %x, float %
; ALL-LABEL: insert_f32_firstelt_of_low_subvector:
; ALL: # %bb.0:
; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; ALL-NEXT: vmovss {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; ALL-NEXT: retq
%i0 = insertelement <8 x float> %x, float %s, i32 0
ret <8 x float> %i0
@@ -94,7 +94,7 @@ define <8 x float> @insert_f32_firstelt_of_high_subvector(<8 x float> %x, float
; AVX-LABEL: insert_f32_firstelt_of_high_subvector:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
@@ -202,9 +202,9 @@ define <4 x i64> @insert_i64_firstelt_of_high_subvector(<4 x i64> %x, i64 %s) {
define <8 x float> @insert_f32_firstelts(<8 x float> %x, float %s) {
; AVX-LABEL: insert_f32_firstelts:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index a8574c0b7516c..30bf1a261f4b7 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1843,7 +1843,7 @@ define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
; X86-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm2, %xmm1
-; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_cvtu64_sd:
@@ -1891,7 +1891,7 @@ define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
; X86-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X86-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index c1ef500d9d3de..aae48aba93be6 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -10483,7 +10483,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; CHECK-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; CHECK-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%q = load float, ptr %ptr_b
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 926af4e9957af..f9b5994a18d36 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -6505,7 +6505,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; CHECK-NEXT: ret{{[l|q]}}
%q = load float, ptr %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
diff --git a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
index a7ca23792e6fe..a2af7df44010e 100644
--- a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
@@ -11,7 +11,7 @@ define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
; NOAVX512MOVZXC: # %bb.0:
; NOAVX512MOVZXC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; NOAVX512MOVZXC-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; NOAVX512MOVZXC-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
; NOAVX512MOVZXC-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
; NOAVX512MOVZXC-NEXT: retq # encoding: [0xc3]
%res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll
index 789196c5e4848..69d17fe3ab69f 100644
--- a/llvm/test/CodeGen/X86/build-vector-512.ll
+++ b/llvm/test/CodeGen/X86/build-vector-512.ll
@@ -578,7 +578,7 @@ define <16 x float> @test_buildvector_16f32_2_var(float %a0, float %a1) {
; AVX-32-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,17,0,0]
; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
-; AVX-32-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
+; AVX-32-NEXT: vmovss {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
; AVX-32-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-32-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0,1,2],xmm2[0]
; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
@@ -626,7 +626,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) {
; AVX-32-NEXT: vbroadcastss (%ecx), %xmm1
; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
-; AVX-32-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
+; AVX-32-NEXT: vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
; AVX-32-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-32-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
; AVX-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
@@ -640,7 +640,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) {
; AVX-64-NEXT: vbroadcastss (%rdi), %xmm1
; AVX-64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-64-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
-; AVX-64-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
+; AVX-64-NEXT: vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
; AVX-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 545c57fed4b2c..9d856ed7647ca 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -42,7 +42,7 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; AVX-LABEL: extract0_i32_zext_insert0_i64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%z = zext i32 %e to i64
@@ -85,7 +85,7 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 1
%z = zext i32 %e to i64
@@ -130,7 +130,7 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 2
%z = zext i32 %e to i64
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index 556b0deaf4c83..8b3aa2964db02 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -51,7 +51,7 @@ define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX512-NEXT: retq
@@ -149,7 +149,7 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
@@ -235,12 +235,12 @@ define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm2
; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512-NEXT: vmovd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
index b42fd957d7f4f..086df87d1d5ff 100644
--- a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
+++ b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
@@ -44,12 +44,12 @@ define <4 x float> @insert_f32(float %a0, <4 x float> %a1) {
;
; AVX-LABEL: insert_f32:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: insert_f32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: retq
%1 = insertelement <4 x float> %a1, float %a0, i32 0
ret <4 x float> %1
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index e5594dc9c5e3c..173457ff46677 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -37,7 +37,7 @@ define <4 x i32> @test1(<4 x i32> %A) {
; AVX-LABEL: test1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 0>
ret <4 x i32> %1
@@ -195,7 +195,7 @@ define <4 x i32> @test11(<4 x i32> %A) {
; AVX-LABEL: test11:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
%1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 -1>
ret <4 x i32> %1
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 95b5fcf8eac52..137c3d9dec7bd 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -86,10 +86,20 @@ define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE4-NEXT: retq
;
-; AVX-LABEL: test4:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: test4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
%or = or <4 x i32> %shuf1, %shuf2
@@ -108,10 +118,20 @@ define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE4-NEXT: retq
;
-; AVX-LABEL: test5:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: test5:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test5:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test5:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
%or = or <4 x i32> %shuf1, %shuf2
@@ -241,10 +261,20 @@ define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE4-NEXT: retq
;
-; AVX-LABEL: test11:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: test11:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test11:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test11:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: retq
%and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
%and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
%or = or <4 x i32> %and1, %and2
@@ -263,10 +293,20 @@ define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE4-NEXT: retq
;
-; AVX-LABEL: test12:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: test12:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test12:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: retq
%and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
%and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
%or = or <4 x i32> %and1, %and2
@@ -395,18 +435,18 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
; AVX1-LABEL: test18:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test18:
; AVX2: # %bb.0:
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index 33bc93d0fe4db..95d350d45d901 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -1343,7 +1343,7 @@ define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) {
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm1[0],mem[0]
; AVX-NEXT: vminpd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_vector_nan:
diff --git a/llvm/test/CodeGen/X86/fmsubadd-combine.ll b/llvm/test/CodeGen/X86/fmsubadd-combine.ll
index ddf51b858cdd8..674a1d5ad779b 100644
--- a/llvm/test/CodeGen/X86/fmsubadd-combine.ll
+++ b/llvm/test/CodeGen/X86/fmsubadd-combine.ll
@@ -12,7 +12,7 @@ define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x dou
; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vsubpd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT: vaddpd %xmm2, %xmm0, %xmm0
-; NOFMA-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; NOFMA-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; NOFMA-NEXT: retq
;
; FMA3-LABEL: mul_subadd_pd128:
@@ -191,7 +191,7 @@ define <2 x double> @mul_subadd_bad_commute(<2 x double> %A, <2 x double> %B, <2
; CHECK-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vsubpd %xmm0, %xmm2, %xmm1
; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT: retq
entry:
%AB = fmul <2 x double> %A, %B
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index fbc798d8bbe48..b013ddad19a95 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -44,7 +44,7 @@ define half @fadd_f16(half %a, half %b) nounwind strictfp {
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -92,7 +92,7 @@ define half @fsub_f16(half %a, half %b) nounwind strictfp {
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -140,7 +140,7 @@ define half @fmul_f16(half %a, half %b) nounwind strictfp {
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -188,7 +188,7 @@ define half @fdiv_f16(half %a, half %b) nounwind strictfp {
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -400,7 +400,7 @@ define void @fsqrt_f16(ptr %a) nounwind strictfp {
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX-NEXT: retq
@@ -469,7 +469,7 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq fmaf at PLT
; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: popq %rax
; F16C-NEXT: retq
@@ -490,7 +490,7 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
index 7c0386f0e784e..cb2f2473f5ead 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
@@ -35,7 +35,7 @@ define half @sitofp_i1tof16(i1 %x) #0 {
; AVX-NEXT: movsbl %dil, %eax
; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -76,7 +76,7 @@ define half @sitofp_i8tof16(i8 %x) #0 {
; AVX-NEXT: movsbl %dil, %eax
; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -112,7 +112,7 @@ define half @sitofp_i16tof16(i16 %x) #0 {
; AVX-NEXT: movswl %di, %eax
; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -146,7 +146,7 @@ define half @sitofp_i32tof16(i32 %x) #0 {
; AVX: # %bb.0:
; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -178,7 +178,7 @@ define half @sitofp_i64tof16(i64 %x) #0 {
; AVX: # %bb.0:
; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -213,7 +213,7 @@ define half @uitofp_i1tof16(i1 %x) #0 {
; AVX-NEXT: andl $1, %edi
; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -251,7 +251,7 @@ define half @uitofp_i8tof16(i8 %x) #0 {
; AVX-NEXT: movzbl %dil, %eax
; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -287,7 +287,7 @@ define half @uitofp_i16tof16(i16 %x) #0 {
; AVX-NEXT: movzwl %di, %eax
; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -323,7 +323,7 @@ define half @uitofp_i32tof16(i32 %x) #0 {
; F16C-NEXT: movl %edi, %eax
; F16C-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: retq
;
@@ -331,7 +331,7 @@ define half @uitofp_i32tof16(i32 %x) #0 {
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: retq
;
@@ -387,7 +387,7 @@ define half @uitofp_i64tof16(i64 %x) #0 {
; F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0
; F16C-NEXT: .LBB9_2:
; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: retq
;
@@ -395,7 +395,7 @@ define half @uitofp_i64tof16(i64 %x) #0 {
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
index 1ab97dafb8514..1f9c9edfa2a2d 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
@@ -31,7 +31,7 @@ define half @fceil32(half %f) #0 {
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -67,7 +67,7 @@ define half @ffloor32(half %f) #0 {
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -103,7 +103,7 @@ define half @ftrunc32(half %f) #0 {
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -139,7 +139,7 @@ define half @frint32(half %f) #0 {
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -176,7 +176,7 @@ define half @fnearbyint32(half %f) #0 {
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -213,7 +213,7 @@ define half @froundeven16(half %f) #0 {
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -251,7 +251,7 @@ define half @fround16(half %f) #0 {
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX-NEXT: callq roundf at PLT
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX-NEXT: popq %rax
; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll
index f1874cc03000a..d5f2060ca20e3 100644
--- a/llvm/test/CodeGen/X86/half-constrained.ll
+++ b/llvm/test/CodeGen/X86/half-constrained.ll
@@ -194,7 +194,7 @@ define void @float_to_half(float %0) strictfp {
; X64-F16C-LABEL: float_to_half:
; X64-F16C: # %bb.0:
; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-F16C-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; X64-F16C-NEXT: movq a at GOTPCREL(%rip), %rax
; X64-F16C-NEXT: vpextrw $0, %xmm0, (%rax)
@@ -350,7 +350,7 @@ define void @add() strictfp {
; X86-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; X86-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86-F16C-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X86-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; X86-F16C-NEXT: vpextrw $0, %xmm0, c
; X86-F16C-NEXT: retl
@@ -387,7 +387,7 @@ define void @add() strictfp {
; X64-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; X64-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-F16C-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; X64-F16C-NEXT: movq c at GOTPCREL(%rip), %rax
; X64-F16C-NEXT: vpextrw $0, %xmm0, (%rax)
diff --git a/llvm/test/CodeGen/X86/half-darwin.ll b/llvm/test/CodeGen/X86/half-darwin.ll
index 3cbf5c11235ea..8765f7dbe6d34 100644
--- a/llvm/test/CodeGen/X86/half-darwin.ll
+++ b/llvm/test/CodeGen/X86/half-darwin.ll
@@ -105,7 +105,7 @@ define void @strict_truncsfhf(float %in, ptr %ptr) nounwind strictfp {
; CHECK-F16C-LABEL: strict_truncsfhf:
; CHECK-F16C: ## %bb.0:
; CHECK-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-F16C-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; CHECK-F16C-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll
index 952940d141808..31551360be483 100644
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -508,8 +508,8 @@ define <8 x float> @PR41512_v8f32(float %x, float %y) {
; AVX-LABEL: PR41512_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
%ins1 = insertelement <8 x float> zeroinitializer, float %x, i32 0
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index 4c5b67962a58b..234c7190256c4 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -204,7 +204,7 @@ define <4 x double> @expandload_v4f64_v4i64(ptr %base, <4 x double> %src0, <4 x
; AVX1-NEXT: retq
; AVX1-NEXT: LBB1_1: ## %cond.load
; AVX1-NEXT: vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX1-NEXT: vmovsd {{.*#+}} ymm0 = ymm1[0],ymm0[1]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je LBB1_4
@@ -245,7 +245,7 @@ define <4 x double> @expandload_v4f64_v4i64(ptr %base, <4 x double> %src0, <4 x
; AVX2-NEXT: retq
; AVX2-NEXT: LBB1_1: ## %cond.load
; AVX2-NEXT: vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX2-NEXT: vmovsd {{.*#+}} ymm0 = ymm1[0],ymm0[1]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je LBB1_4
@@ -1120,7 +1120,7 @@ define <2 x float> @expandload_v2f32_v2i1(ptr %base, <2 x float> %src0, <2 x i32
; AVX1OR2-NEXT: retq
; AVX1OR2-NEXT: LBB4_1: ## %cond.load
; AVX1OR2-NEXT: vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1OR2-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX1OR2-NEXT: addq $4, %rdi
; AVX1OR2-NEXT: testb $2, %al
; AVX1OR2-NEXT: je LBB4_4
@@ -2111,7 +2111,7 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32
; AVX1-NEXT: retq
; AVX1-NEXT: LBB8_1: ## %cond.load
; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je LBB8_4
@@ -2159,7 +2159,7 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32
; AVX1-NEXT: je LBB8_18
; AVX1-NEXT: LBB8_17: ## %cond.load29
; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovss {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $512, %eax ## imm = 0x200
; AVX1-NEXT: je LBB8_20
@@ -2207,7 +2207,7 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32
; AVX1-NEXT: je LBB8_34
; AVX1-NEXT: LBB8_33: ## %cond.load61
; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovss {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX1-NEXT: je LBB8_36
@@ -2255,7 +2255,7 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32
; AVX1-NEXT: je LBB8_50
; AVX1-NEXT: LBB8_49: ## %cond.load93
; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovss {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX1-NEXT: je LBB8_52
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index 559a7ec0930b9..324a371632c45 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -65,7 +65,7 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x ptr> %ptr, <4 x i32> %trigger, <
; AVX1-NEXT: # %bb.1: # %cond.load
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: .LBB0_2: # %else
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB0_4
@@ -105,7 +105,7 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x ptr> %ptr, <4 x i32> %trigger, <
; AVX2-NEXT: # %bb.1: # %cond.load
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: .LBB0_2: # %else
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB0_4
@@ -254,7 +254,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(ptr %base, <4 x i32> %idx, <4 x i32
; AVX1-NEXT: # %bb.1: # %cond.load
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: .LBB1_2: # %else
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB1_4
@@ -299,7 +299,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(ptr %base, <4 x i32> %idx, <4 x i32
; AVX2-NEXT: # %bb.1: # %cond.load
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: .LBB1_2: # %else
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB1_4
@@ -451,7 +451,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(ptr %base, <4 x i64> %idx, <4 x i32
; AVX1-NEXT: # %bb.1: # %cond.load
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX1-NEXT: .LBB2_2: # %else
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB2_4
@@ -495,7 +495,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(ptr %base, <4 x i64> %idx, <4 x i32
; AVX2-NEXT: # %bb.1: # %cond.load
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; AVX2-NEXT: .LBB2_2: # %else
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB2_4
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index af018d83d520e..4e6f666fa05de 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -3481,7 +3481,7 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
; X86-SKX-NEXT: .LBB47_1: # %cond.load
; X86-SKX-NEXT: vmovd %xmm0, %ecx
; X86-SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SKX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; X86-SKX-NEXT: vmovss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; X86-SKX-NEXT: testb $2, %al
; X86-SKX-NEXT: je .LBB47_4
; X86-SKX-NEXT: .LBB47_3: # %cond.load1
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 89459a2d10177..e2e26da95b874 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6424,7 +6424,7 @@ define <4 x i32> @mload_constmask_v4i32(ptr %addr, <4 x i32> %dst) {
; AVX1: ## %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: mload_constmask_v4i32:
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index a1da40e7e7655..3500a24dfed18 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -235,7 +235,7 @@ define void @PR42833() {
; AVX1-NEXT: vmovdqa c+144(%rip), %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
; AVX1-NEXT: vmovdqa d+144(%rip), %xmm2
; AVX1-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmovups %ymm0, c+128(%rip)
@@ -317,7 +317,7 @@ define void @PR42833() {
; XOP-NEXT: vmovdqa c+144(%rip), %xmm3
; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm3
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
+; XOP-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
; XOP-NEXT: vmovdqa d+144(%rip), %xmm2
; XOP-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2
; XOP-NEXT: vmovups %ymm0, c+128(%rip)
diff --git a/llvm/test/CodeGen/X86/pr40730.ll b/llvm/test/CodeGen/X86/pr40730.ll
index 164bf203d0545..304d071e5d6e5 100644
--- a/llvm/test/CodeGen/X86/pr40730.ll
+++ b/llvm/test/CodeGen/X86/pr40730.ll
@@ -5,7 +5,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: shuffle_v8i32_0dcd3f14:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0]
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
@@ -26,7 +26,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14_constant(<8 x i32> %a0) {
; CHECK-LABEL: shuffle_v8i32_0dcd3f14_constant:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1,1,0]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5],ymm0[6,7]
diff --git a/llvm/test/CodeGen/X86/scalarize-fp.ll b/llvm/test/CodeGen/X86/scalarize-fp.ll
index ea9b69f8f5b80..ae24d3487c4b1 100644
--- a/llvm/test/CodeGen/X86/scalarize-fp.ll
+++ b/llvm/test/CodeGen/X86/scalarize-fp.ll
@@ -911,7 +911,7 @@ define <4 x float> @merge_fcmp_cmpeqss_v4f32(<4 x float> %x, <4 x float> %y) {
; AVX1-LABEL: merge_fcmp_cmpeqss_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vcmpeqss %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX1-NEXT: retq
;
; AVX512-LABEL: merge_fcmp_cmpeqss_v4f32:
diff --git a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
index f0af8fc29969b..5ae9055835716 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
@@ -22,7 +22,7 @@ define <4 x float> @insert_f32_firstelt(<4 x float> %x, ptr %s.addr) {
; AVX-LABEL: insert_f32_firstelt:
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
%s = load float, ptr %s.addr
%i0 = insertelement <4 x float> %x, float %s, i32 0
diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll
index 34de7e65465d1..1e4fe81abc136 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -19,7 +19,7 @@ define <4 x float> @insert_f32_firstelt(<4 x float> %x, float %s) {
;
; AVX-LABEL: insert_f32_firstelt:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
%i0 = insertelement <4 x float> %x, float %s, i32 0
ret <4 x float> %i0
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index b1f9872f7b6eb..2e2e78a6da51e 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -190,7 +190,7 @@ define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
; AVX-LABEL: test_mm_cmpge_ss:
; AVX: # %bb.0:
; AVX-NEXT: vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
-; AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
@@ -232,7 +232,7 @@ define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
; AVX-LABEL: test_mm_cmpgt_ss:
; AVX: # %bb.0:
; AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
-; AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
@@ -382,7 +382,7 @@ define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
; AVX-LABEL: test_mm_cmpnge_ss:
; AVX: # %bb.0:
; AVX-NEXT: vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
-; AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
@@ -424,7 +424,7 @@ define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
; AVX-LABEL: test_mm_cmpngt_ss:
; AVX: # %bb.0:
; AVX-NEXT: vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
-; AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
@@ -1603,7 +1603,7 @@ define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
;
; AVX-LABEL: test_mm_move_ss:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
@@ -2219,7 +2219,7 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X86-AVX1-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; X86-AVX1-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
; X86-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
; X86-AVX1-NEXT: retl # encoding: [0xc3]
;
@@ -2228,7 +2228,7 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X86-AVX512-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; X86-AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
; X86-AVX512-NEXT: retl # encoding: [0xc3]
;
@@ -2243,7 +2243,7 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; X64-AVX-LABEL: test_mm_set_ss:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X64-AVX-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; X64-AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
; X64-AVX-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
; X64-AVX-NEXT: retq # encoding: [0xc3]
%res0 = insertelement <4 x float> undef, float %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 47d35f3636d46..006c3006350cc 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1165,7 +1165,7 @@ define <4 x float> @insert_test5_sub_ss(<4 x float> %a, <4 x float> %b) {
; AVX-LABEL: insert_test5_sub_ss:
; AVX: # %bb.0:
; AVX-NEXT: vsubps %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: ret{{[l|q]}}
%1 = fsub <4 x float> %b, %a
%2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1203,7 +1203,7 @@ define <4 x float> @insert_test5_div_ss(<4 x float> %a, <4 x float> %b) {
; AVX-LABEL: insert_test5_div_ss:
; AVX: # %bb.0:
; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: ret{{[l|q]}}
%1 = fdiv <4 x float> %b, %a
%2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1241,7 +1241,7 @@ define <2 x double> @insert_test5_sub_sd(<2 x double> %a, <2 x double> %b) {
; AVX-LABEL: insert_test5_sub_sd:
; AVX: # %bb.0:
; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: ret{{[l|q]}}
%1 = fsub <2 x double> %b, %a
%2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
@@ -1279,7 +1279,7 @@ define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
; AVX-LABEL: insert_test5_div_sd:
; AVX: # %bb.0:
; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: ret{{[l|q]}}
%1 = fdiv <2 x double> %b, %a
%2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
@@ -1318,7 +1318,7 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
; X86-AVX1-NEXT: # %bb.1:
; X86-AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: .LBB70_2:
-; X86-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X86-AVX1-NEXT: retl
;
; X86-AVX512-LABEL: add_ss_mask:
@@ -1360,7 +1360,7 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
; X64-AVX1-NEXT: # %bb.1:
; X64-AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: .LBB70_2:
-; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X64-AVX1-NEXT: retq
;
; X64-AVX512-LABEL: add_ss_mask:
@@ -1412,7 +1412,7 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
; X86-AVX1-NEXT: # %bb.1:
; X86-AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT: .LBB71_2:
-; X86-AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X86-AVX1-NEXT: retl
;
; X86-AVX512-LABEL: add_sd_mask:
@@ -1454,7 +1454,7 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
; X64-AVX1-NEXT: # %bb.1:
; X64-AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT: .LBB71_2:
-; X64-AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X64-AVX1-NEXT: retq
;
; X64-AVX512-LABEL: add_sd_mask:
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index dbdc45abb24d6..18a6be8aaf0b6 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -631,7 +631,7 @@ define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwi
; AVX-LABEL: test_mm_cmpge_sd:
; AVX: # %bb.0:
; AVX-NEXT: vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02]
-; AVX-NEXT: vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 2)
@@ -748,7 +748,7 @@ define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwi
; AVX-LABEL: test_mm_cmpgt_sd:
; AVX: # %bb.0:
; AVX-NEXT: vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01]
-; AVX-NEXT: vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 1)
@@ -976,7 +976,7 @@ define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounw
; AVX-LABEL: test_mm_cmpnge_sd:
; AVX: # %bb.0:
; AVX-NEXT: vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06]
-; AVX-NEXT: vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 6)
@@ -1021,7 +1021,7 @@ define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounw
; AVX-LABEL: test_mm_cmpngt_sd:
; AVX: # %bb.0:
; AVX-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05]
-; AVX-NEXT: vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 5)
diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll
index 3e5d76eae0bb3..e1d91b407fc28 100644
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@@ -670,7 +670,7 @@ define <4 x i32> @PR19721(<4 x i32> %i) {
; AVX-LABEL: PR19721:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: ret{{[l|q]}}
;
; X64-SSE-LABEL: PR19721:
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 2d7258a49f5d0..53a10ab0c26ff 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -353,7 +353,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX1-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X86-AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX1-NEXT: retl ## encoding: [0xc3]
;
@@ -361,7 +361,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX512-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X86-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
@@ -373,7 +373,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
;
; X64-AVX-LABEL: blendps_not_insertps_1:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X64-AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; X64-AVX-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-AVX-NEXT: retq ## encoding: [0xc3]
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
@@ -440,7 +440,7 @@ define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nou
;
; AVX-LABEL: blendps_not_insertps_2:
; AVX: ## %bb.0:
-; AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; AVX-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%tmp2 = extractelement <4 x float> %t2, i32 0
@@ -1207,7 +1207,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; AVX1-LABEL: i32_shuf_X00A:
; AVX1: ## %bb.0:
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; AVX1-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
+; AVX1-NEXT: vmovss %xmm0, %xmm2, %xmm0 ## encoding: [0xc5,0xea,0x10,0xc0]
; AVX1-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3]
; AVX1-NEXT: vshufps $0, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0x00]
; AVX1-NEXT: ## xmm1 = xmm1[0,0,0,0]
@@ -1218,7 +1218,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; AVX512-LABEL: i32_shuf_X00A:
; AVX512: ## %bb.0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; AVX512-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm0 ## encoding: [0xc5,0xea,0x10,0xc0]
; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9]
; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
index 6625cc4f07a27..6a4e87ec3ceb8 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -1216,7 +1216,7 @@ define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
@@ -1307,7 +1307,7 @@ define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
index 2c3d7ceb37d03..a6e288608c87b 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
@@ -154,7 +154,7 @@ define <4 x float> @f18(<4 x float> %a0, <8 x half> %a1) #0 {
; CHECK-LABEL: f18:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; CHECK-NEXT: ret{{[l|q]}}
%ext = extractelement <8 x half> %a1, i32 0
%cvt = call float @llvm.experimental.constrained.fpext.f32.f16(half %ext,
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
index 0a9dd78afb8cc..b40f8befa1ec9 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
@@ -31,7 +31,7 @@ define <2 x i64> @strict_vector_fptosi_v2f16_to_v2i64(<2 x half> %a) #0 {
; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2qq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%ret = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f16(<2 x half> %a,
@@ -43,7 +43,7 @@ define <2 x i64> @strict_vector_fptoui_v2f16_to_v2i64(<2 x half> %a) #0 {
; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2uqq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%ret = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f16(<2 x half> %a,
@@ -55,7 +55,7 @@ define <2 x i32> @strict_vector_fptosi_v2f16_to_v2i32(<2 x half> %a) #0 {
; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2dq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%ret = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f16(<2 x half> %a,
@@ -67,7 +67,7 @@ define <2 x i32> @strict_vector_fptoui_v2f16_to_v2i32(<2 x half> %a) #0 {
; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%ret = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f16(<2 x half> %a,
@@ -79,7 +79,7 @@ define <2 x i16> @strict_vector_fptosi_v2f16_to_v2i16(<2 x half> %a) #0 {
; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f16(<2 x half> %a,
@@ -91,7 +91,7 @@ define <2 x i16> @strict_vector_fptoui_v2f16_to_v2i16(<2 x half> %a) #0 {
; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f16(<2 x half> %a,
@@ -103,7 +103,7 @@ define <2 x i8> @strict_vector_fptosi_v2f16_to_v2i8(<2 x half> %a) #0 {
; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
@@ -116,7 +116,7 @@ define <2 x i8> @strict_vector_fptoui_v2f16_to_v2i8(<2 x half> %a) #0 {
; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
@@ -129,7 +129,7 @@ define <2 x i1> @strict_vector_fptosi_v2f16_to_v2i1(<2 x half> %a) #0 {
; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i1:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2w %xmm0, %xmm0
; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT: vpmovw2m %xmm0, %k1
@@ -145,7 +145,7 @@ define <2 x i1> @strict_vector_fptoui_v2f16_to_v2i1(<2 x half> %a) #0 {
; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i1:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: vcvttph2uw %xmm0, %xmm0
; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT: vpmovw2m %xmm0, %k1
diff --git a/llvm/test/CodeGen/X86/vec_extract-avx.ll b/llvm/test/CodeGen/X86/vec_extract-avx.ll
index 341a703a21bd5..4b70933334fb7 100644
--- a/llvm/test/CodeGen/X86/vec_extract-avx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-avx.ll
@@ -119,7 +119,7 @@ define void @legal_vzmovl_2i32_8i32(ptr %in, ptr %out) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X86-NEXT: vmovaps %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
@@ -128,7 +128,7 @@ define void @legal_vzmovl_2i32_8i32(ptr %in, ptr %out) {
; X64: # %bb.0:
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: vmovaps %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -169,7 +169,7 @@ define void @legal_vzmovl_2f32_8f32(ptr %in, ptr %out) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X86-NEXT: vmovaps %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
@@ -178,7 +178,7 @@ define void @legal_vzmovl_2f32_8f32(ptr %in, ptr %out) {
; X64: # %bb.0:
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: vmovaps %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index abb85ac83464c..0538cac12cbf7 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -828,13 +828,13 @@ define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
; AVX-LABEL: floor_ss:
; AVX: ## %bb.0:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_ss:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: retq
%s = extractelement <4 x float> %x, i32 0
%call = call float @llvm.floor.f32(float %s)
@@ -853,13 +853,13 @@ define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
; AVX-LABEL: floor_sd:
; AVX: ## %bb.0:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_sd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512-NEXT: retq
%s = extractelement <2 x double> %x, i32 0
%call = call double @llvm.floor.f64(double %s)
@@ -1372,7 +1372,7 @@ define <4 x float> @floor_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB52_2:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_ss:
@@ -1414,7 +1414,7 @@ define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwi
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB53_2:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_ss:
@@ -1452,7 +1452,7 @@ define <2 x double> @floor_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB54_2:
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_sd:
@@ -1494,7 +1494,7 @@ define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nou
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB55_2:
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_sd:
@@ -1532,7 +1532,7 @@ define <4 x float> @floor_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x flo
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB56_2:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_ss_trunc:
@@ -1572,11 +1572,11 @@ define <4 x float> @floor_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k)
; AVX-NEXT: jne LBB57_1
; AVX-NEXT: ## %bb.2:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
; AVX-NEXT: LBB57_1:
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_ss_trunc:
@@ -1613,7 +1613,7 @@ define <2 x double> @floor_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB58_2:
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_sd_trunc:
@@ -1657,7 +1657,7 @@ define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16
; AVX-NEXT: retq
; AVX-NEXT: LBB59_1:
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_sd_trunc:
@@ -1689,7 +1689,7 @@ define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x flo
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm3
; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_ss_mask8:
@@ -1723,7 +1723,7 @@ define <4 x float> @floor_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwin
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_ss_mask8:
@@ -1756,7 +1756,7 @@ define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm3
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_mask_sd_mask8:
@@ -1790,7 +1790,7 @@ define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) noun
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: floor_maskz_sd_mask8:
@@ -1818,13 +1818,13 @@ define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
; AVX-LABEL: ceil_ss:
; AVX: ## %bb.0:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_ss:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: retq
%s = extractelement <4 x float> %x, i32 0
%call = call float @llvm.ceil.f32(float %s)
@@ -1843,13 +1843,13 @@ define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
; AVX-LABEL: ceil_sd:
; AVX: ## %bb.0:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_sd:
; AVX512: ## %bb.0:
; AVX512-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512-NEXT: retq
%s = extractelement <2 x double> %x, i32 0
%call = call double @llvm.ceil.f64(double %s)
@@ -2362,7 +2362,7 @@ define <4 x float> @ceil_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w,
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB78_2:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_ss:
@@ -2404,7 +2404,7 @@ define <4 x float> @ceil_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwin
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB79_2:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_ss:
@@ -2442,7 +2442,7 @@ define <2 x double> @ceil_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double>
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB80_2:
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_sd:
@@ -2484,7 +2484,7 @@ define <2 x double> @ceil_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) noun
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB81_2:
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_sd:
@@ -2522,7 +2522,7 @@ define <4 x float> @ceil_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x floa
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB82_2:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_ss_trunc:
@@ -2562,11 +2562,11 @@ define <4 x float> @ceil_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k)
; AVX-NEXT: jne LBB83_1
; AVX-NEXT: ## %bb.2:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
; AVX-NEXT: LBB83_1:
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_ss_trunc:
@@ -2603,7 +2603,7 @@ define <2 x double> @ceil_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x d
; AVX-NEXT: ## %bb.1:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: LBB84_2:
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_sd_trunc:
@@ -2647,7 +2647,7 @@ define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %
; AVX-NEXT: retq
; AVX-NEXT: LBB85_1:
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_sd_trunc:
@@ -2679,7 +2679,7 @@ define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x floa
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm3
; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_ss_mask8:
@@ -2713,7 +2713,7 @@ define <4 x float> @ceil_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_ss_mask8:
@@ -2746,7 +2746,7 @@ define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x d
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm3
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_mask_sd_mask8:
@@ -2780,7 +2780,7 @@ define <2 x double> @ceil_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounw
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: ceil_maskz_sd_mask8:
diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
index e4304f2cc214a..e73d345d0fcd4 100644
--- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -38,7 +38,7 @@ define i16 @test1(float %f) nounwind {
; X86_AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86_AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86_AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X86_AVX1-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86_AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86_AVX1-NEXT: vcvttss2si %xmm0, %eax
@@ -50,7 +50,7 @@ define i16 @test1(float %f) nounwind {
; X64_AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64_AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64_AVX1-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX1-NEXT: vcvttss2si %xmm0, %eax
@@ -63,7 +63,7 @@ define i16 @test1(float %f) nounwind {
; X86_AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86_AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86_AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X86_AVX512-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86_AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
@@ -76,7 +76,7 @@ define i16 @test1(float %f) nounwind {
; X64_AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64_AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64_AVX512-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64_AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index bd5c9363794aa..a38028e87532f 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -54,7 +54,7 @@ define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) {
;
; AVX-LABEL: vsel_float2:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
entry:
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 1bbf92e45fc6c..01159d4135d8e 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -5034,7 +5034,7 @@ define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
; F16C-LABEL: fptoui_2f16_to_4i32:
; F16C: # %bb.0:
; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvttps2dq %xmm0, %xmm1
; F16C-NEXT: vpsrad $31, %xmm1, %xmm2
@@ -5048,7 +5048,7 @@ define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
; AVX512F-LABEL: fptoui_2f16_to_4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index 0fba7de803488..fb17a5c16e937 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -51,7 +51,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm0[1,1]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,0]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
@@ -452,7 +452,7 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [7,3,7,3,7,3,7,3]
; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX2-FCP-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r9)
; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9)
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm3[3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index bead2c94cf121..71d4641ecc8f1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -650,7 +650,7 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3]
; AVX2-FCP-NEXT: vpermps %ymm6, %ymm3, %ymm3
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
+; AVX2-FCP-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax)
; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rax)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index f41123c5c3cfd..de94a3547b4bb 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -322,7 +322,7 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
; AVX-NEXT: vbroadcastsd 8(%rsi), %ymm6
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX-NEXT: vmovsd {{.*#+}} ymm0 = ymm1[0],ymm0[1]
; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm5[0],mem[0]
; AVX-NEXT: vmovaps (%rdi), %xmm5
; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
@@ -762,7 +762,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vbroadcastsd 40(%rsi), %ymm13
; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3]
; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3]
-; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm5[0],ymm9[1,2,3]
+; AVX-NEXT: vmovsd {{.*#+}} ymm9 = ymm5[0],ymm9[1]
; AVX-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,3,2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],mem[0],ymm1[2],mem[2]
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
@@ -1747,7 +1747,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm8
; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3]
; AVX-NEXT: vmovapd 32(%r8), %ymm8
-; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm8[0],ymm13[1,2,3]
+; AVX-NEXT: vmovsd {{.*#+}} ymm9 = ymm8[0],ymm13[1]
; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0],ymm8[1],ymm15[2,3]
; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1773,7 +1773,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovapd 96(%r8), %ymm0
; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3]
-; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3]
+; AVX-NEXT: vmovsd {{.*#+}} ymm3 = ymm0[0],ymm3[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3]
; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3]
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
@@ -3750,7 +3750,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vbroadcastsd 248(%rcx), %ymm2
; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
; AVX-NEXT: vmovapd 224(%r8), %ymm5
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0],ymm11[1,2,3]
+; AVX-NEXT: vmovsd {{.*#+}} ymm2 = ymm5[0],ymm11[1]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 1d389f9817229..8679c262e0bf0 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -877,7 +877,7 @@ define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) {
; AVX-LABEL: shuffle_v2i64_bitcast_z123:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
%bitcast32 = bitcast <2 x i64> %x to <4 x float>
%shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> <float 1.000000e+00, float poison, float poison, float poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 0eb72c8bc0be4..e1eb1a6704e39 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -506,7 +506,7 @@ define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
; AVX1OR2-LABEL: shuffle_v4i32_4012:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,2]
-; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1OR2-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_4012:
@@ -618,7 +618,7 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
; AVX-LABEL: shuffle_v4f32_4zzz:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x float> %shuffle
@@ -1164,7 +1164,7 @@ define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
; AVX-LABEL: shuffle_v4i32_4zzz:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x i32> %shuffle
@@ -1202,14 +1202,14 @@ define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
; AVX1-LABEL: shuffle_v4i32_z4zz:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-SLOW-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX2-SLOW-NEXT: retq
;
@@ -1258,14 +1258,14 @@ define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
; AVX1-LABEL: shuffle_v4i32_zz4z:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-SLOW-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; AVX2-SLOW-NEXT: retq
;
@@ -2138,7 +2138,7 @@ define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
; AVX-LABEL: insert_reg_and_zero_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%v = insertelement <4 x float> poison, float %a, i32 0
%shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index d848a8b879215..94fc982d87e50 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2376,33 +2376,21 @@ define <4 x i64> @shuffle_v4i64_0zzz_pgso(<4 x i64> %a) !prof !14 {
}
define <8 x float> @shuffle_v8f32_0zzzzzzz_pgso(<8 x float> %a) !prof !14 {
-; AVX1OR2-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
+; ALL: # %bb.0:
+; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; ALL-NEXT: retq
%b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x float> %b
}
define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 {
-; AVX1OR2-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
+; ALL: # %bb.0:
+; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; ALL-NEXT: retq
%b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x i32> %b
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index bd78dbded0705..2aa6a7ff325f2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -191,7 +191,7 @@ define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00500000:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4]
; AVX1-NEXT: retq
;
@@ -1422,14 +1422,14 @@ define <8 x float> @shuffle_v8f32_089abcde(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_089abcde:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [u,0,1,2,3,4,5,6]
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_089abcde:
@@ -1821,7 +1821,7 @@ define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00500000:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4]
; AVX1-NEXT: retq
;
@@ -3121,14 +3121,14 @@ define <8 x i32> @shuffle_v8i32_089abcde(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_089abcde:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [u,0,1,2,3,4,5,6]
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_089abcde:
@@ -3402,7 +3402,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_0dcd3f14:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 38920aa5d7a12..f4f4842e4c69d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -435,7 +435,7 @@ define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
; CHECK-LABEL: combine_pshufb_as_vzmovl_32:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT: ret{{[l|q]}}
%1 = bitcast <8 x float> %a0 to <32 x i8>
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 12d494c32b656..0570e2f580c1b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -75,7 +75,7 @@ define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
;
; AVX-LABEL: combine_pshufb_as_movss:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 3, i32 2, i32 1>
%2 = bitcast <4 x float> %1 to <16 x i8>
@@ -137,7 +137,7 @@ define <4 x float> @combine_pshufb_as_vzmovl_32(<4 x float> %a0) {
; AVX-LABEL: combine_pshufb_as_vzmovl_32:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%1 = bitcast <4 x float> %a0 to <16 x i8>
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index f53c7a3370174..e8bf5ec2b49a6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -106,7 +106,7 @@ define <4 x float> @combine_vpermil2ps_blend_with_zero(<4 x float> %a0, <4 x flo
; CHECK-LABEL: combine_vpermil2ps_blend_with_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; CHECK-NEXT: ret{{[l|q]}}
%res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 8, i32 1, i32 2, i32 3>, i8 2)
ret <4 x float> %res0
diff --git a/llvm/test/CodeGen/X86/vector-zmov.ll b/llvm/test/CodeGen/X86/vector-zmov.ll
index 09835d797d172..2f84723b3c081 100644
--- a/llvm/test/CodeGen/X86/vector-zmov.ll
+++ b/llvm/test/CodeGen/X86/vector-zmov.ll
@@ -63,7 +63,7 @@ define <4 x i32> @load_zmov_4i32_to_0zzz_volatile(ptr%ptr) {
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
entry:
%X = load volatile <4 x i32>, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index be6ee8f689958..9851fe64847de 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -313,7 +313,7 @@ define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
;
; AVX-LABEL: test18:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
@@ -332,7 +332,7 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
;
; AVX-LABEL: test19:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %1
@@ -390,7 +390,7 @@ define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
;
; AVX-LABEL: test22:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
@@ -410,7 +410,7 @@ define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
;
; AVX-LABEL: test23:
; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %1
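A quick note on why the (V)BLENDPS/(V)BLENDPD -> (V)MOVSS/(V)MOVSD rewrites in the test updates above are value-preserving: with a blend immediate of 1, only element 0 is taken from the second source and the remaining elements come from the first, which is exactly what the scalar register-register moves do. The snippet below is a standalone sanity check of that lane equivalence, not part of the patch; the file name and build flags are only suggestions.

// Standalone check (not part of the patch): a blend with immediate 1 selects
// the same lanes as a scalar move, i.e. _mm_blend_ps(a, b, 1) == _mm_move_ss(a, b)
// and _mm_blend_pd(a, b, 1) == _mm_move_sd(a, b).
// Suggested build: clang++ -O2 -msse4.1 blend_vs_move.cpp (file name is illustrative).
#include <immintrin.h>
#include <cstdio>
#include <cstring>

int main() {
  const __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  const __m128 b = _mm_setr_ps(9.0f, 8.0f, 7.0f, 6.0f);
  __m128 blend_ps = _mm_blend_ps(a, b, 0x1); // element 0 from b, elements 1..3 from a
  __m128 move_ss  = _mm_move_ss(a, b);       // element 0 from b, elements 1..3 from a

  const __m128d ad = _mm_setr_pd(1.0, 2.0);
  const __m128d bd = _mm_setr_pd(9.0, 8.0);
  __m128d blend_pd = _mm_blend_pd(ad, bd, 0x1); // element 0 from bd, element 1 from ad
  __m128d move_sd  = _mm_move_sd(ad, bd);       // element 0 from bd, element 1 from ad

  bool same_ps = std::memcmp(&blend_ps, &move_ss, sizeof(blend_ps)) == 0;
  bool same_pd = std::memcmp(&blend_pd, &move_sd, sizeof(blend_pd)) == 0;
  std::printf("ps match: %d, pd match: %d\n", same_ps, same_pd); // expect 1 1
  return (same_ps && same_pd) ? 0 : 1;
}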
From 135e8bd9b50ca36b9d65e8cb4d9353fd3fa59c84 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Sun, 8 Jun 2025 22:49:48 +0800
Subject: [PATCH 2/5] [X86] Tune X86FixupInstTuning to improve scheduling for
MOVSS/D blend patterns
---
llvm/lib/Target/X86/X86FixupInstTuning.cpp | 13 -------------
1 file changed, 13 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 7f4d73e89f472..748ebcc8a5569 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -222,8 +222,6 @@ bool X86FixupInstTuningPass::processInstruction(
return ProcessUNPCKToIntDomain(NewOpc);
};
-
-
auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
if (!MI.getOperand(NumOperands - 1).isImm() ||
MI.getOperand(NumOperands - 1).getImm() != 1)
@@ -237,12 +235,7 @@ bool X86FixupInstTuningPass::processInstruction(
MI.removeOperand(NumOperands - 1);
return true;
};
-
-
-
-
switch (Opc) {
-
case X86::VBLENDPSrri:
case X86::VBLENDPSYrri:
case X86::VBLENDMPSZ128rrkz:
@@ -253,7 +246,6 @@ bool X86FixupInstTuningPass::processInstruction(
return false;
return ProcessBLENDToMOV(X86::VMOVSSrr);
}
-
case X86::VBLENDPDrri:
case X86::VBLENDPDYrri:
case X86::VBLENDMPDZ128rrkz:
@@ -264,11 +256,6 @@ bool X86FixupInstTuningPass::processInstruction(
return false;
return ProcessBLENDToMOV(X86::VMOVSDrr);
}
-
-
-
-
-
case X86::VPERMILPDri:
return ProcessVPERMILPDri(X86::VSHUFPDrri);
case X86::VPERMILPDYri:
From 934de0305b0678eca16da6a6d8089feea9df85e3 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Tue, 10 Jun 2025 20:41:38 +0800
Subject: [PATCH 3/5] [X86] Update llc tests after optimizing VBLENDPS to
VMOVSS
---
llvm/lib/Target/X86/X86FixupInstTuning.cpp | 16 ++++------------
llvm/test/CodeGen/X86/avx-insertelt.ll | 2 +-
llvm/test/CodeGen/X86/masked_expandload.ll | 12 ++++++------
llvm/test/CodeGen/X86/oddsubvector.ll | 4 ++--
.../X86/vector-interleaved-store-i64-stride-5.ll | 10 +++++-----
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll | 12 ++++++------
6 files changed, 24 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 748ebcc8a5569..044f51c985ec7 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -223,8 +223,7 @@ bool X86FixupInstTuningPass::processInstruction(
};
auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
- if (!MI.getOperand(NumOperands - 1).isImm() ||
- MI.getOperand(NumOperands - 1).getImm() != 1)
+ if (MI.getOperand(NumOperands - 1).getImm() != 1)
return false;
bool Force = MF.getFunction().hasOptSize();
@@ -235,22 +234,15 @@ bool X86FixupInstTuningPass::processInstruction(
MI.removeOperand(NumOperands - 1);
return true;
};
+
switch (Opc) {
- case X86::VBLENDPSrri:
- case X86::VBLENDPSYrri:
- case X86::VBLENDMPSZ128rrkz:
- case X86::VBLENDMPSZ256rrkz:
- case X86::VBLENDMPSZrrkz: {
+ case X86::VBLENDPSrri: {
int Imm = MI.getOperand(NumOperands - 1).getImm();
if (Imm != 1)
return false;
return ProcessBLENDToMOV(X86::VMOVSSrr);
}
- case X86::VBLENDPDrri:
- case X86::VBLENDPDYrri:
- case X86::VBLENDMPDZ128rrkz:
- case X86::VBLENDMPDZ256rrkz:
- case X86::VBLENDMPDZrrkz: {
+ case X86::VBLENDPDrri: {
int Imm = MI.getOperand(NumOperands - 1).getImm();
if (Imm != 1)
return false;
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
index 81f3058f19579..c23fb00843a8c 100644
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -8,7 +8,7 @@ define <8 x float> @insert_f32_firstelt_of_low_subvector(<8 x float> %x, float %
; ALL-LABEL: insert_f32_firstelt_of_low_subvector:
; ALL: # %bb.0:
; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; ALL-NEXT: vmovss {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; ALL-NEXT: retq
%i0 = insertelement <8 x float> %x, float %s, i32 0
ret <8 x float> %i0
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index 234c7190256c4..b7fe8e053fa15 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -204,7 +204,7 @@ define <4 x double> @expandload_v4f64_v4i64(ptr %base, <4 x double> %src0, <4 x
; AVX1-NEXT: retq
; AVX1-NEXT: LBB1_1: ## %cond.load
; AVX1-NEXT: vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
-; AVX1-NEXT: vmovsd {{.*#+}} ymm0 = ymm1[0],ymm0[1]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je LBB1_4
@@ -245,7 +245,7 @@ define <4 x double> @expandload_v4f64_v4i64(ptr %base, <4 x double> %src0, <4 x
; AVX2-NEXT: retq
; AVX2-NEXT: LBB1_1: ## %cond.load
; AVX2-NEXT: vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
-; AVX2-NEXT: vmovsd {{.*#+}} ymm0 = ymm1[0],ymm0[1]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je LBB1_4
@@ -2111,7 +2111,7 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32
; AVX1-NEXT: retq
; AVX1-NEXT: LBB8_1: ## %cond.load
; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je LBB8_4
@@ -2159,7 +2159,7 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32
; AVX1-NEXT: je LBB8_18
; AVX1-NEXT: LBB8_17: ## %cond.load29
; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovss {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $512, %eax ## imm = 0x200
; AVX1-NEXT: je LBB8_20
@@ -2207,7 +2207,7 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32
; AVX1-NEXT: je LBB8_34
; AVX1-NEXT: LBB8_33: ## %cond.load61
; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovss {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX1-NEXT: je LBB8_36
@@ -2255,7 +2255,7 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32
; AVX1-NEXT: je LBB8_50
; AVX1-NEXT: LBB8_49: ## %cond.load93
; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovss {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX1-NEXT: je LBB8_52
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index 3500a24dfed18..a1da40e7e7655 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -235,7 +235,7 @@ define void @PR42833() {
; AVX1-NEXT: vmovdqa c+144(%rip), %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vmovdqa d+144(%rip), %xmm2
; AVX1-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmovups %ymm0, c+128(%rip)
@@ -317,7 +317,7 @@ define void @PR42833() {
; XOP-NEXT: vmovdqa c+144(%rip), %xmm3
; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm3
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOP-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
+; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
; XOP-NEXT: vmovdqa d+144(%rip), %xmm2
; XOP-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2
; XOP-NEXT: vmovups %ymm0, c+128(%rip)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index de94a3547b4bb..f41123c5c3cfd 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -322,7 +322,7 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
; AVX-NEXT: vbroadcastsd 8(%rsi), %ymm6
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3]
-; AVX-NEXT: vmovsd {{.*#+}} ymm0 = ymm1[0],ymm0[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm5[0],mem[0]
; AVX-NEXT: vmovaps (%rdi), %xmm5
; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
@@ -762,7 +762,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vbroadcastsd 40(%rsi), %ymm13
; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3]
; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3]
-; AVX-NEXT: vmovsd {{.*#+}} ymm9 = ymm5[0],ymm9[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm5[0],ymm9[1,2,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,3,2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],mem[0],ymm1[2],mem[2]
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
@@ -1747,7 +1747,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm8
; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3]
; AVX-NEXT: vmovapd 32(%r8), %ymm8
-; AVX-NEXT: vmovsd {{.*#+}} ymm9 = ymm8[0],ymm13[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm8[0],ymm13[1,2,3]
; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0],ymm8[1],ymm15[2,3]
; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1773,7 +1773,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovapd 96(%r8), %ymm0
; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3]
-; AVX-NEXT: vmovsd {{.*#+}} ymm3 = ymm0[0],ymm3[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3]
; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3]
; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3]
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
@@ -3750,7 +3750,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vbroadcastsd 248(%rcx), %ymm2
; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
; AVX-NEXT: vmovapd 224(%r8), %ymm5
-; AVX-NEXT: vmovsd {{.*#+}} ymm2 = ymm5[0],ymm11[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0],ymm11[1,2,3]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 2aa6a7ff325f2..56256b4945334 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -191,7 +191,7 @@ define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00500000:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4]
; AVX1-NEXT: retq
;
@@ -1422,14 +1422,14 @@ define <8 x float> @shuffle_v8f32_089abcde(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
-; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_089abcde:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [u,0,1,2,3,4,5,6]
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_089abcde:
@@ -1821,7 +1821,7 @@ define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00500000:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4]
; AVX1-NEXT: retq
;
@@ -3121,14 +3121,14 @@ define <8 x i32> @shuffle_v8i32_089abcde(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
-; AVX1-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_089abcde:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [u,0,1,2,3,4,5,6]
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vmovss {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_089abcde:
>From 30577d58e462d89367d29bc9995921de42cfc1ae Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Tue, 10 Jun 2025 21:31:36 +0800
Subject: [PATCH 4/5] [X86] Update llc tests after optimizing VBLENDPS to
VMOVSS
---
llvm/lib/Target/X86/X86FixupInstTuning.cpp | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 044f51c985ec7..958a3a9ffdfda 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -225,9 +225,7 @@ bool X86FixupInstTuningPass::processInstruction(
auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
if (MI.getOperand(NumOperands - 1).getImm() != 1)
return false;
-
bool Force = MF.getFunction().hasOptSize();
-
if (!Force && !NewOpcPreferable(MovOpc))
return false;
MI.setDesc(TII->get(MovOpc));
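
For reference, here is a standalone sketch (not the pass's actual code) of the gate ProcessBLENDToMOV applies in the hunk above: only a blend whose immediate is 1 (take element 0 from the second source, keep the rest) is a candidate, and the rewrite happens when the function optimizes for size or the tuning model prefers the MOV form. MovFormPreferable and OptimizingForSize are hypothetical stand-ins for NewOpcPreferable and MF.getFunction().hasOptSize().

  #include <cstdint>
  #include <iostream>

  // Stand-in for the scheduler-model comparison done by NewOpcPreferable.
  static bool MovFormPreferable() { return true; }

  // Stand-in for MF.getFunction().hasOptSize().
  static bool OptimizingForSize() { return false; }

  // Returns true when a (V)BLENDPS/D with this immediate may be rewritten to
  // the (V)MOVSS/D register form.
  static bool ShouldRewriteBlendToMov(uint8_t BlendImm) {
    if (BlendImm != 1)
      return false;
    bool Force = OptimizingForSize(); // the MOV form has a shorter encoding
    return Force || MovFormPreferable();
  }

  int main() {
    std::cout << ShouldRewriteBlendToMov(0x1) << '\n'; // 1: candidate for MOVSS/D
    std::cout << ShouldRewriteBlendToMov(0x3) << '\n'; // 0: keep the blend
    return 0;
  }
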
>From 1604a5e4989baaba8a490373929a6b1999d9486c Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Tue, 10 Jun 2025 21:42:49 +0800
Subject: [PATCH 5/5] [X86] Update llc tests after optimizing VBLENDPS to
VMOVSS
---
llvm/lib/Target/X86/X86FixupInstTuning.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 958a3a9ffdfda..92b793f10a998 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -221,7 +221,7 @@ bool X86FixupInstTuningPass::processInstruction(
auto ProcessUNPCKPS = [&](unsigned NewOpc) -> bool {
return ProcessUNPCKToIntDomain(NewOpc);
};
-
+
auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
if (MI.getOperand(NumOperands - 1).getImm() != 1)
return false;